From b58c3478bf2d78edfc8f706e5f9487e2ab35ee4b Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sun, 1 Sep 2024 05:26:10 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 64390 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 64785 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..9193b647 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-08-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.14471v1","updated":"2024-08-26T17:59:01Z","published":"2024-08-26T17:59:01Z","title":"A Practitioner's Guide to Continual Multimodal Pretraining","summary":" Multimodal foundation models serve numerous applications at the intersection\nof vision and language. Still, despite being pretrained on extensive data, they\nbecome outdated over time. To keep models updated, research into continual\npretraining mainly explores scenarios with either (1) infrequent,\nindiscriminate updates on large-scale new data, or (2) frequent, sample-level\nupdates. However, practical model deployment often operates in the gap between\nthese two limit cases, as real-world applications often demand adaptation to\nspecific subdomains, tasks or concepts -- spread over the entire, varying life\ncycle of a model. In this work, we complement current perspectives on continual\npretraining through a research test bed as well as provide comprehensive\nguidance for effective continual model updates in such scenarios. We first\nintroduce FoMo-in-Flux, a continual multimodal pretraining benchmark with\nrealistic compute constraints and practical deployment requirements,\nconstructed over 63 datasets with diverse visual and semantic coverage. Using\nFoMo-in-Flux, we explore the complex landscape of practical continual\npretraining through multiple perspectives: (1) A data-centric investigation of\ndata mixtures and stream orderings that emulate real-world deployment\nsituations, (2) a method-centric investigation ranging from simple fine-tuning\nand traditional continual learning strategies to parameter-efficient updates\nand model merging, (3) meta learning rate schedules and mechanistic design\nchoices, and (4) the influence of model and compute scaling. Together, our\ninsights provide a practitioner's guide to continual multimodal pretraining for\nreal-world deployment. Our benchmark and code is here:\nhttps://github.com/ExplainableML/fomo_in_flux.\n","authors":["Karsten Roth","Vishaal Udandarao","Sebastian Dziadzio","Ameya Prabhu","Mehdi Cherti","Oriol Vinyals","Olivier Hénaff","Samuel Albanie","Matthias Bethge","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2408.14471v1.pdf","comment":"Technical Report. 52 pages"},{"id":"http://arxiv.org/abs/2408.14470v1","updated":"2024-08-26T17:58:53Z","published":"2024-08-26T17:58:53Z","title":"Step-by-Step Unmasking for Parameter-Efficient Fine-tuning of Large\n Language Models","summary":" Fine-tuning large language models (LLMs) on downstream tasks requires\nsubstantial computational resources. 
A class of parameter-efficient fine-tuning\n(PEFT) aims to mitigate these computational challenges by selectively\nfine-tuning only a small fraction of the model parameters. Although\ncomputationally efficient, these techniques often fail to match the performance\nof fully fine-tuned models, primarily due to inherent biases introduced during\nparameter selection. Traditional selective PEFT techniques use a fixed set of\nparameters based on a predefined budget (a process also known as unmasking),\nfailing to capture parameter importance dynamically and often ending up\nexceeding the budget. We introduce $\\text{ID}^3$, a novel selective PEFT method\nthat calculates parameter importance continually and dynamically unmasks\nparameters by balancing exploration and exploitation in parameter selection.\nOur empirical study on 15 tasks spanning natural language understanding and\ngenerative tasks demonstrates the effectiveness of our method compared to\nfixed-masking-based PEFT techniques. We analytically show that $\\text{ID}^3$\nreduces the number of gradient updates by a factor of two, enhancing\ncomputational efficiency. $\\text{ID}^3$ is robust to random initialization of\nneurons and, therefore, can be seamlessly integrated into existing additive and\nreparametrization-based PEFT modules such as adapters and LoRA for dynamic\nsparsification.\n","authors":["Aradhye Agarwal","Suhas K Ramesh","Ayan Sengupta","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2408.14470v1.pdf","comment":"15 pages, 7 tables, 9 figures"},{"id":"http://arxiv.org/abs/2408.14467v1","updated":"2024-08-26T17:58:17Z","published":"2024-08-26T17:58:17Z","title":"Explicit Inductive Inference using Large Language Models","summary":" Large Language Models (LLMs) are reported to hold undesirable attestation\nbias on inference tasks: when asked to predict if a premise P entails a\nhypothesis H, instead of considering H's conditional truthfulness entailed by\nP, LLMs tend to use the out-of-context truth label of H as a fragile proxy. In\nthis paper, we propose a pipeline that exploits this bias to do explicit\ninductive inference. Our pipeline uses an LLM to transform a premise into a set\nof attested alternatives, and then aggregate answers of the derived new\nentailment inquiries to support the original inference prediction. On a\ndirectional predicate entailment benchmark, we demonstrate that by applying\nthis simple pipeline, we can improve the overall performance of LLMs on\ninference and substantially alleviate the impact of their attestation bias.\n","authors":["Tianyang Liu","Tianyi Li","Liang Cheng","Mark Steedman"],"pdf_url":"https://arxiv.org/pdf/2408.14467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11796v2","updated":"2024-08-26T17:50:46Z","published":"2024-08-21T17:38:48Z","title":"LLM Pruning and Distillation in Practice: The Minitron Approach","summary":" We present a comprehensive report on compressing the Llama 3.1 8B and Mistral\nNeMo 12B models to 4B and 8B parameters, respectively, using pruning and\ndistillation. We explore two distinct pruning strategies: (1) depth pruning and\n(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on\ncommon benchmarks from the LM Evaluation Harness. The models are then aligned\nwith NeMo Aligner and tested in instruct-tuned versions. This approach produces\na compelling 4B model from Llama 3.1 8B and a state-of-the-art\nMistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo\n12B. 
We found that with no access to the original data, it is beneficial to\nslightly fine-tune teacher models on the distillation dataset. We open-source\nour base model weights on Hugging Face with a permissive license.\n","authors":["Sharath Turuvekere Sreenivas","Saurav Muralidharan","Raviraj Joshi","Marcin Chochowski","Mostofa Patwary","Mohammad Shoeybi","Bryan Catanzaro","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2408.11796v2.pdf","comment":"v2: Added missing references. Cleaned up runtime performance section"},{"id":"http://arxiv.org/abs/2306.13840v3","updated":"2024-08-26T17:34:44Z","published":"2023-06-24T02:25:56Z","title":"Beyond Scale: The Diversity Coefficient as a Data Quality Metric for\n Variability in Natural Language Data","summary":" Current trends in pre-training Large Language Models (LLMs) primarily focus\non the scaling of model and dataset size. While the quality of pre-training\ndata is considered an important factor for training powerful LLMs, it remains a\nnebulous concept that has not been rigorously characterized. To this end, we\npropose a formalization of one key aspect of data quality -- measuring the\nvariability of natural language data -- specifically via a measure we call the\ndiversity coefficient. Our empirical analysis shows that the proposed diversity\ncoefficient aligns with the intuitive properties of diversity and variability,\ne.g., it increases as the number of latent concepts increases. Then, we measure\nthe diversity coefficient of publicly available pre-training datasets and\ndemonstrate that their formal diversity is high compared to theoretical lower\nand upper bounds. Finally, we conduct a comprehensive set of controlled\ninterventional experiments with GPT-2 and LLaMAv2 that demonstrate the\ndiversity coefficient of pre-training data characterizes useful aspects of\ndownstream model evaluation performance -- totaling 44 models of various sizes\n(51M to 7B parameters). We conclude that our formal notion of diversity is an\nimportant aspect of data quality that captures variability and causally leads\nto improved evaluation performance.\n","authors":["Brando Miranda","Alycia Lee","Sudharsan Sundar","Allison Casasola","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2306.13840v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10468v3","updated":"2024-08-26T17:28:23Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. 
To establish easily obtained groundtruth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E\ndataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14438v1","updated":"2024-08-26T17:25:16Z","published":"2024-08-26T17:25:16Z","title":"Evaluating Large Language Models on Spatial Tasks: A Multi-Task\n Benchmarking Study","summary":" The advent of large language models such as ChatGPT, Gemini, and others has\nunderscored the importance of evaluating their diverse capabilities, ranging\nfrom natural language understanding to code generation. However, their\nperformance on spatial tasks has not been comprehensively assessed. This study\naddresses this gap by introducing a novel multi-task spatial evaluation\ndataset, designed to systematically explore and compare the performance of\nseveral advanced models on spatial tasks. The dataset encompasses twelve\ndistinct task types, including spatial understanding and path planning, each\nwith verified, accurate answers. We evaluated multiple models, including\nOpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase\ntesting approach. Initially, we conducted zero-shot testing, followed by\ncategorizing the dataset by difficulty and performing prompt tuning tests.\nResults indicate that gpt-4o achieved the highest overall accuracy in the first\nphase, with an average of 71.3%. Although moonshot-v1-8k slightly\nunderperformed overall, it surpassed gpt-4o in place name recognition tasks.\nThe study also highlights the impact of prompt strategies on model performance\nin specific tasks. For example, the Chain-of-Thought (COT) strategy increased\ngpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot\nstrategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to\n76.3%.\n","authors":["Liuchang Xu Shuo Zhao","Qingming Lin","Luyao Chen","Qianqian Luo","Sensen Wu","Xinyue Ye","Hailin Feng","Zhenhong Du"],"pdf_url":"https://arxiv.org/pdf/2408.14438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14419v1","updated":"2024-08-26T17:04:23Z","published":"2024-08-26T17:04:23Z","title":"CHARTOM: A Visual Theory-of-Mind Benchmark for Multimodal Large Language\n Models","summary":" We introduce CHARTOM, a visual theory-of-mind benchmark for multimodal large\nlanguage models. CHARTOM consists of specially designed data visualizing\ncharts. Given a chart, a language model needs to not only correctly comprehend\nthe chart (the FACT question) but also judge if the chart will be misleading to\na human reader (the MIND question). Both questions have significant societal\nbenefits. 
We detail the construction of the CHARTOM benchmark including its\ncalibration on human performance.\n","authors":["Shubham Bharti","Shiyun Cheng","Jihyun Rho","Martina Rao","Xiaojin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14418v1","updated":"2024-08-26T17:04:00Z","published":"2024-08-26T17:04:00Z","title":"MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR\n Errors with LLM-generated Synthetic Dialogues","summary":" Automatic Speech Recognition (ASR) systems are pivotal in transcribing speech\ninto text, yet the errors they introduce can significantly degrade the\nperformance of downstream tasks like summarization. This issue is particularly\npronounced in clinical dialogue summarization, a low-resource domain where\nsupervised data for fine-tuning is scarce, necessitating the use of ASR models\nas black-box solutions. Employing conventional data augmentation for enhancing\nthe noise robustness of summarization models is not feasible either due to the\nunavailability of sufficient medical dialogue audio recordings and\ncorresponding ASR transcripts. To address this challenge, we propose MEDSAGE,\nan approach for generating synthetic samples for data augmentation using Large\nLanguage Models (LLMs). Specifically, we leverage the in-context learning\ncapabilities of LLMs and instruct them to generate ASR-like errors based on a\nfew available medical dialogue examples with audio recordings. Experimental\nresults show that LLMs can effectively model ASR noise, and incorporating this\nnoisy data into the training process significantly improves the robustness and\naccuracy of medical dialogue summarization systems. This approach addresses the\nchallenges of noisy ASR outputs in critical applications, offering a robust\nsolution to enhance the reliability of clinical dialogue summarization.\n","authors":["Kuluhan Binici","Abhinav Ramesh Kashyap","Viktor Schlegel","Andy T. Liu","Vijay Prakash Dwivedi","Thanh-Tung Nguyen","Xiaoxue Gao","Nancy F. Chen","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2408.14418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05720v2","updated":"2024-08-26T16:48:08Z","published":"2024-03-08T23:17:55Z","title":"A Dataset and Benchmark for Hospital Course Summarization with Adapted\n Large Language Models","summary":" Brief hospital course (BHC) summaries are clinical documents that summarize a\npatient's hospital stay. While large language models (LLMs) depict remarkable\ncapabilities in automating real-world tasks, their capabilities for healthcare\napplications such as synthesizing BHCs from clinical notes have not been shown.\nWe introduce a novel pre-processed dataset, the MIMIC-IV-BHC, encapsulating\nclinical note and brief hospital course (BHC) pairs to adapt LLMs for BHC\nsynthesis. Furthermore, we introduce a benchmark of the summarization\nperformance of two general-purpose LLMs and three healthcare-adapted LLMs.\n Using clinical notes as input, we apply prompting-based (using in-context\nlearning) and fine-tuning-based adaptation strategies to three open-source LLMs\n(Clinical-T5-Large, Llama2-13B, FLAN-UL2) and two proprietary LLMs (GPT-3.5,\nGPT-4). We evaluate these LLMs across multiple context-length inputs using\nnatural language similarity metrics. 
We further conduct a clinical study with\nfive clinicians, comparing clinician-written and LLM-generated BHCs across 30\nsamples, focusing on their potential to enhance clinical decision-making\nthrough improved summary quality. We observe that the Llama2-13B fine-tuned LLM\noutperforms other domain-adapted models given quantitative evaluation metrics\nof BLEU and BERT-Score. GPT-4 with in-context learning shows more robustness to\nincreasing context lengths of clinical note inputs than fine-tuned Llama2-13B.\nDespite comparable quantitative metrics, the reader study depicts a significant\npreference for summaries generated by GPT-4 with in-context learning compared\nto both Llama2-13B fine-tuned summaries and the original summaries,\nhighlighting the need for qualitative clinical evaluation.\n","authors":["Asad Aali","Dave Van Veen","Yamin Ishraq Arefeen","Jason Hom","Christian Bluethgen","Eduardo Pontes Reis","Sergios Gatidis","Namuun Clifford","Joseph Daws","Arash S. Tehrani","Jangwon Kim","Akshay S. Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2403.05720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14398v1","updated":"2024-08-26T16:29:13Z","published":"2024-08-26T16:29:13Z","title":"Language-specific Calibration for Pruning Multilingual Language Models","summary":" Recent advances in large language model (LLM) pruning have shown\nstate-of-the-art compression results in post-training and retraining-free\nsettings while maintaining high predictive performance. However, such research\nmainly considers calibrating pruning using English text, despite the\nmultilingual nature of modern LLMs and their frequent uses in non-English\nlanguages. In this paper, we set out to explore effective strategies for\ncalibrating the pruning of multilingual language models. We present the first\ncomprehensive empirical study, comparing different calibration languages for\npruning multilingual models across diverse tasks, models, and state-of-the-art\npruning techniques. Our results present practical suggestions, for example,\ncalibrating in the target language can efficiently yield lower perplexity, but\ndoes not necessarily benefit downstream tasks. Our further analysis experiments\nunveil that calibration in the target language mainly contributes to preserving\nlanguage-specific features related to fluency and coherence, but might not\ncontribute to capturing language-agnostic features such as language\nunderstanding and reasoning. Last, we provide practical recommendations for\nfuture practitioners.\n","authors":["Simon Kurz","Zhixue Zhao","Jian-Jia Chen","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2408.14398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14397v1","updated":"2024-08-26T16:28:56Z","published":"2024-08-26T16:28:56Z","title":"Uncovering Knowledge Gaps in Radiology Report Generation Models through\n Knowledge Graphs","summary":" Recent advancements in artificial intelligence have significantly improved\nthe automatic generation of radiology reports. However, existing evaluation\nmethods fail to reveal the models' understanding of radiological images and\ntheir capacity to achieve human-level granularity in descriptions. To bridge\nthis gap, we introduce a system, named ReXKG, which extracts structured\ninformation from processed reports to construct a comprehensive radiology\nknowledge graph. 
We then propose three metrics to evaluate the similarity of\nnodes (ReXKG-NSC), distribution of edges (ReXKG-AMS), and coverage of subgraphs\n(ReXKG-SCS) across various knowledge graphs. We conduct an in-depth comparative\nanalysis of AI-generated and human-written radiology reports, assessing the\nperformance of both specialist and generalist models. Our study provides a\ndeeper understanding of the capabilities and limitations of current AI models\nin radiology report generation, offering valuable insights for improving model\nperformance and clinical applicability.\n","authors":["Xiaoman Zhang","Julián N. Acosta","Hong-Yu Zhou","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2408.14397v1.pdf","comment":"Code is available at: https://github.com/rajpurkarlab/ReXKG"},{"id":"http://arxiv.org/abs/2408.14380v1","updated":"2024-08-26T16:00:41Z","published":"2024-08-26T16:00:41Z","title":"Probing Causality Manipulation of Large Language Models","summary":" Large language models (LLMs) have shown various ability on natural language\nprocessing, including problems about causality. It is not intuitive for LLMs to\ncommand causality, since pretrained models usually work on statistical\nassociations, and do not focus on causes and effects in sentences. So that\nprobing internal manipulation of causality is necessary for LLMs. This paper\nproposes a novel approach to probe causality manipulation hierarchically, by\nproviding different shortcuts to models and observe behaviors. We exploit\nretrieval augmented generation (RAG) and in-context learning (ICL) for models\non a designed causality classification task. We conduct experiments on\nmainstream LLMs, including GPT-4 and some smaller and domain-specific models.\nOur results suggest that LLMs can detect entities related to causality and\nrecognize direct causal relationships. However, LLMs lack specialized cognition\nfor causality, merely treating them as part of the global semantic of the\nsentence.\n","authors":["Chenyang Zhang","Haibo Tong","Bin Zhang","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19178v2","updated":"2024-08-26T15:59:03Z","published":"2024-04-30T01:02:15Z","title":"Revenge of the Fallen? Recurrent Models Match Transformers at Predicting\n Human Language Comprehension Metrics","summary":" Transformers have generally supplanted recurrent neural networks as the\ndominant architecture for both natural language processing tasks and for\nmodelling the effect of predictability on online human language comprehension.\nHowever, two recently developed recurrent model architectures, RWKV and Mamba,\nappear to perform natural language tasks comparably to or better than\ntransformers of equivalent scale. In this paper, we show that contemporary\nrecurrent models are now also able to match - and in some cases, exceed - the\nperformance of comparably sized transformers at modeling online human language\ncomprehension. This suggests that transformer language models are not uniquely\nsuited to this task, and opens up new directions for debates about the extent\nto which architectural features of language models make them better or worse\nmodels of human language comprehension.\n","authors":["James A. Michaelov","Catherine Arnett","Benjamin K. 
Bergen"],"pdf_url":"https://arxiv.org/pdf/2404.19178v2.pdf","comment":"Accepted at COLM 2024"},{"id":"http://arxiv.org/abs/2408.14354v1","updated":"2024-08-26T15:30:05Z","published":"2024-08-26T15:30:05Z","title":"SWE-bench-java: A GitHub Issue Resolving Benchmark for Java","summary":" GitHub issue resolving is a critical task in software engineering, recently\ngaining significant attention in both industry and academia. Within this task,\nSWE-bench has been released to evaluate issue resolving capabilities of large\nlanguage models (LLMs), but has so far only focused on Python version. However,\nsupporting more programming languages is also important, as there is a strong\ndemand in industry. As a first step toward multilingual support, we have\ndeveloped a Java version of SWE-bench, called SWE-bench-java. We have publicly\nreleased the dataset, along with the corresponding Docker-based evaluation\nenvironment and leaderboard, which will be continuously maintained and updated\nin the coming months. To verify the reliability of SWE-bench-java, we implement\na classic method SWE-agent and test several powerful LLMs on it. As is well\nknown, developing a high-quality multi-lingual benchmark is time-consuming and\nlabor-intensive, so we welcome contributions through pull requests or\ncollaboration to accelerate its iteration and refinement, paving the way for\nfully automated programming.\n","authors":["Daoguang Zan","Zhirong Huang","Ailun Yu","Shaoxin Lin","Yifan Shi","Wei Liu","Dong Chen","Zongshuai Qi","Hao Yu","Lei Yu","Dezhi Ran","Muhan Zeng","Bo Shen","Pan Bian","Guangtai Liang","Bei Guan","Pengjie Huang","Tao Xie","Yongji Wang","Qianxiang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14354v1.pdf","comment":"This work is in progress"},{"id":"http://arxiv.org/abs/2408.14352v1","updated":"2024-08-26T15:29:34Z","published":"2024-08-26T15:29:34Z","title":"Assessing Contamination in Large Language Models: Introducing the\n LogProber method","summary":" In machine learning, contamination refers to situations where testing data\nleak into the training set. The issue is particularly relevant for the\nevaluation of the performance of Large Language Models (LLMs), which are\ngenerally trained on gargantuan, and generally opaque, corpora of text scraped\nfrom the world wide web. Developing tools to detect contamination is therefore\ncrucial to be able to fairly and properly track the evolution of the\nperformance of LLMs. Most recent works in the field are not tailored to\nquantify contamination on short sequences of text like we find in psychology\nquestionnaires. In the present paper we introduce LogProber, a novel,\nefficient, algorithm that we show able to detect contamination using token\nprobability in given sentences. In the second part we investigate the\nlimitations of the method and discuss how different training methods can\ncontaminate models without leaving traces in the token probabilities.\n","authors":["Nicolas Yax","Pierre-Yves Oudeyer","Stefano Palminteri"],"pdf_url":"https://arxiv.org/pdf/2408.14352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14340v1","updated":"2024-08-26T15:13:14Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. 
This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elio Quinton","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wehhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16528v2","updated":"2024-08-26T14:59:53Z","published":"2024-05-26T11:29:57Z","title":"LoQT: Low Rank Adapters for Quantized Training","summary":" Training of large neural networks requires significant computational\nresources. Despite advances using low-rank adapters and quantization,\npretraining of models such as LLMs on consumer hardware has not been possible\nwithout model sharding, offloading during training, or per-layer gradient\nupdates. To address these limitations, we propose LoQT, a method for\nefficiently training quantized models. LoQT uses gradient-based tensor\nfactorization to initialize low-rank trainable weight matrices that are\nperiodically merged into quantized full-rank weight matrices. Our approach is\nsuitable for both pretraining and fine-tuning of models, which we demonstrate\nexperimentally for language modeling and downstream task adaptation. We find\nthat LoQT enables efficient training of models up to 7B parameters on a\nconsumer-grade 24GB GPU. We also demonstrate the feasibility of training a 13B\nparameter model using per-layer gradient updates on the same hardware.\n","authors":["Sebastian Loeschcke","Mads Toftrup","Michael J. 
Kastoryano","Serge Belongie","Vésteinn Snæbjarnarson"],"pdf_url":"https://arxiv.org/pdf/2405.16528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14317v1","updated":"2024-08-26T14:45:03Z","published":"2024-08-26T14:45:03Z","title":"Claim Verification in the Age of Large Language Models: A Survey","summary":" The large and ever-increasing amount of data available on the Internet\ncoupled with the laborious task of manual claim and fact verification has\nsparked the interest in the development of automated claim verification\nsystems. Several deep learning and transformer-based models have been proposed\nfor this task over the years. With the introduction of Large Language Models\n(LLMs) and their superior performance in several NLP tasks, we have seen a\nsurge of LLM-based approaches to claim verification along with the use of novel\nmethods such as Retrieval Augmented Generation (RAG). In this survey, we\npresent a comprehensive account of recent claim verification frameworks using\nLLMs. We describe the different components of the claim verification pipeline\nused in these frameworks in detail including common approaches to retrieval,\nprompting, and fine-tuning. Finally, we describe publicly available English\ndatasets created for this task.\n","authors":["Alphaeus Dmonte","Roland Oruche","Marcos Zampieri","Prasad Calyam","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2408.14317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14307v1","updated":"2024-08-26T14:38:19Z","published":"2024-08-26T14:38:19Z","title":"LLM-3D Print: Large Language Models To Monitor and Control 3D Printing","summary":" Industry 4.0 has revolutionized manufacturing by driving digitalization and\nshifting the paradigm toward additive manufacturing (AM). Fused Deposition\nModeling (FDM), a key AM technology, enables the creation of highly customized,\ncost-effective products with minimal material waste through layer-by-layer\nextrusion, posing a significant challenge to traditional subtractive methods.\nHowever, the susceptibility of material extrusion techniques to errors often\nrequires expert intervention to detect and mitigate defects that can severely\ncompromise product quality. While automated error detection and machine\nlearning models exist, their generalizability across diverse 3D printer setups,\nfirmware, and sensors is limited, and deep learning methods require extensive\nlabeled datasets, hindering scalability and adaptability. To address these\nchallenges, we present a process monitoring and control framework that\nleverages pre-trained Large Language Models (LLMs) alongside 3D printers to\ndetect and address printing defects. The LLM evaluates print quality by\nanalyzing images captured after each layer or print segment, identifying\nfailure modes and querying the printer for relevant parameters. It then\ngenerates and executes a corrective action plan. We validated the effectiveness\nof the proposed framework in identifying defects by comparing it against a\ncontrol group of engineers with diverse AM expertise. 
Our evaluation\ndemonstrated that LLM-based agents not only accurately identify common 3D\nprinting errors, such as inconsistent extrusion, stringing, warping, and layer\nadhesion, but also effectively determine the parameters causing these failures\nand autonomously correct them without any need for human intervention.\n","authors":["Yayati Jadhav","Peter Pak","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2408.14307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10594v3","updated":"2024-08-26T14:30:38Z","published":"2024-06-15T11:03:33Z","title":"BlockPruner: Fine-grained Pruning for Large Language Models","summary":" With the rapid growth in the size and complexity of large language models\n(LLMs), the costs associated with their training and inference have escalated\nsignificantly. Research indicates that certain layers in LLMs harbor\nsubstantial redundancy, and pruning these layers has minimal impact on the\noverall performance. While various layer pruning methods have been developed\nbased on this insight, they generally overlook the finer-grained redundancies\nwithin the layers themselves. In this paper, we delve deeper into the\narchitecture of LLMs and demonstrate that finer-grained pruning can be achieved\nby targeting redundancies in multi-head attention (MHA) and multi-layer\nperceptron (MLP) blocks. We propose a novel, training-free structured pruning\napproach called BlockPruner. Unlike existing layer pruning methods, BlockPruner\nsegments each Transformer layer into MHA and MLP blocks. It then assesses the\nimportance of these blocks using perplexity measures and applies a heuristic\nsearch for iterative pruning. We applied BlockPruner to LLMs of various sizes\nand architectures and validated its performance across a wide range of\ndownstream tasks. Experimental results show that BlockPruner achieves more\ngranular and effective pruning compared to state-of-the-art baselines.\n","authors":["Longguang Zhong","Fanqi Wan","Ruijun Chen","Xiaojun Quan","Liangzhi Li"],"pdf_url":"https://arxiv.org/pdf/2406.10594v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14283v1","updated":"2024-08-26T14:09:28Z","published":"2024-08-26T14:09:28Z","title":"Predictability and Causality in Spanish and English Natural Language\n Generation","summary":" In recent years, the field of Natural Language Generation (NLG) has been\nboosted by the recent advances in deep learning technologies. Nonetheless,\nthese new data-intensive methods introduce language-dependent disparities in\nNLG as the main training data sets are in English. Also, most neural NLG\nsystems use decoder-only (causal) transformer language models, which work well\nfor English, but were not designed with other languages in mind. In this work\nwe depart from the hypothesis that they may introduce generation bias in target\nlanguages with less rigid word ordering, subject omission, or different\nattachment preferences for relative clauses, so that for these target languages\nother language generation strategies may be more desirable. This paper first\ncompares causal and non-causal language modeling for English and Spanish, two\nlanguages with different grammatical structures and over 1.5 billion and 0.5\nbillion speakers, respectively. For this purpose, we define a novel metric of\naverage causal and non-causal context-conditioned entropy of the grammatical\ncategory distribution for both languages as an information-theoretic a priori\napproach. 
The evaluation of natural text sources (such as training data) in\nboth languages reveals lower average non-causal conditional entropy in Spanish\nand lower causal conditional entropy in English. According to this experiment,\nSpanish is more predictable than English given a non-causal context. Then, by\napplying a conditional relative entropy metric to text generation experiments,\nwe obtain as insights that the best performance is respectively achieved with\ncausal NLG in English, and with non-causal NLG in Spanish. These insights\nsupport further research in NLG in Spanish using bidirectional transformer\nlanguage models.\n","authors":["Andrea Busto-Castiñeira","Francisco J. González-Castaño","Silvia García-Méndez","Francisco de Arriba-Pérez"],"pdf_url":"https://arxiv.org/pdf/2408.14283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09869v2","updated":"2024-08-26T13:55:59Z","published":"2024-08-19T10:20:06Z","title":"Docling Technical Report","summary":" This technical report introduces Docling, an easy to use, self-contained,\nMIT-licensed open-source package for PDF document conversion. It is powered by\nstate-of-the-art specialized AI models for layout analysis (DocLayNet) and\ntable structure recognition (TableFormer), and runs efficiently on commodity\nhardware in a small resource budget. The code interface allows for easy\nextensibility and addition of new features and models.\n","authors":["Christoph Auer","Maksym Lysak","Ahmed Nassar","Michele Dolfi","Nikolaos Livathinos","Panos Vagenas","Cesar Berrospi Ramis","Matteo Omenetti","Fabian Lindlbauer","Kasper Dinkla","Valery Weber","Lucas Morin","Ingmar Meijer","Viktor Kuropiatnyk","Peter W. J. Staar"],"pdf_url":"https://arxiv.org/pdf/2408.09869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14277v1","updated":"2024-08-26T13:53:04Z","published":"2024-08-26T13:53:04Z","title":"Epidemic Information Extraction for Event-Based Surveillance using Large\n Language Models","summary":" This paper presents a novel approach to epidemic surveillance, leveraging the\npower of Artificial Intelligence and Large Language Models (LLMs) for effective\ninterpretation of unstructured big data sources, like the popular ProMED and\nWHO Disease Outbreak News. We explore several LLMs, evaluating their\ncapabilities in extracting valuable epidemic information. We further enhance\nthe capabilities of the LLMs using in-context learning, and test the\nperformance of an ensemble model incorporating multiple open-source LLMs. The\nfindings indicate that LLMs can significantly enhance the accuracy and\ntimeliness of epidemic modelling and forecasting, offering a promising tool for\nmanaging future pandemic events.\n","authors":["Sergio Consoli","Peter Markov","Nikolaos I. Stilianakis","Lorenzo Bertolini","Antonio Puertas Gallardo","Mario Ceresa"],"pdf_url":"https://arxiv.org/pdf/2408.14277v1.pdf","comment":"11 pages, 4 figures, Ninth International Congress on Information and\n Communication Technology (ICICT 2024)"},{"id":"http://arxiv.org/abs/2408.14262v1","updated":"2024-08-26T13:29:25Z","published":"2024-08-26T13:29:25Z","title":"Self-supervised Speech Representations Still Struggle with African\n American Vernacular English","summary":" Underperformance of ASR systems for speakers of African American Vernacular\nEnglish (AAVE) and other marginalized language varieties is a well-documented\nphenomenon, and one that reinforces the stigmatization of these varieties. 
We\ninvestigate whether or not the recent wave of Self-Supervised Learning (SSL)\nspeech models can close the gap in ASR performance between AAVE and Mainstream\nAmerican English (MAE). We evaluate four SSL models (wav2vec 2.0, HuBERT,\nWavLM, and XLS-R) on zero-shot Automatic Speech Recognition (ASR) for these two\nvarieties and find that these models perpetuate the bias in performance against\nAAVE. Additionally, the models have higher word error rates on utterances with\nmore phonological and morphosyntactic features of AAVE. Despite the success of\nSSL speech models in improving ASR for low resource varieties, SSL pre-training\nalone may not bridge the gap between AAVE and MAE. Our code is publicly\navailable at https://github.com/cmu-llab/s3m-aave.\n","authors":["Kalvin Chang","Yi-Hui Chou","Jiatong Shi","Hsuan-Ming Chen","Nicole Holliday","Odette Scharenborg","David R. Mortensen"],"pdf_url":"https://arxiv.org/pdf/2408.14262v1.pdf","comment":"INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.20584v2","updated":"2024-08-26T13:19:48Z","published":"2024-07-30T06:33:44Z","title":"Pruning Large Language Models with Semi-Structural Adaptive Sparse\n Training","summary":" The tremendous success of Large Language Models (LLMs) across various complex\ntasks relies heavily on their substantial scale, which raises challenges during\nmodel deployment due to their large memory consumption. Recently, numerous\nstudies have attempted to compress LLMs using one-shot pruning methods.\nHowever, these methods often experience considerable performance degradation on\ncomplex language understanding tasks, calling into question the feasibility of\npruning in LLMs. To address this issue, we propose a pruning pipeline for\nsemi-structured sparse models via retraining, termed Adaptive Sparse Trainer\n(AST). Unlike previous one-shot pruning methods, AST incrementally transforms\ndense models into sparse ones by applying decay to masked weights while\nallowing the model to adaptively select masks throughout the training process.\nFurthermore, we observe that using distillation with a dense model as the\nteacher can prevent the sparse model from falling into local optima and\naccelerate convergence. In addition, we incorporate extra well-initialized\nparameters to further enhance model performance with minimal increase in memory\nfootprint. AST can significantly enhance model performance, approaching the\nlevel of dense models. When applied to the LLaMA2-7B model, AST reduces the\nzero-shot accuracy gap between dense and semi-structured sparse models to 1.12%\nacross multiple zero-shot tasks, utilizing less than 0.4% of the pretraining\ntokens. Our work demonstrates the feasibility of deploying semi-structured\nsparse large language models and introduces a novel method for achieving highly\ncompressed models when combined with existing quantization techniques.\n","authors":["Weiyu Huang","Yuezhou Hu","Guohao Jian","Jun Zhu","Jianfei Chen"],"pdf_url":"https://arxiv.org/pdf/2407.20584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14236v1","updated":"2024-08-26T12:50:27Z","published":"2024-08-26T12:50:27Z","title":"DSTI at LLMs4OL 2024 Task A: Intrinsic versus extrinsic knowledge for\n type classification","summary":" We introduce semantic towers, an extrinsic knowledge representation method,\nand compare it to intrinsic knowledge in large language models for ontology\nlearning. 
Our experiments show a trade-off between performance and semantic\ngrounding for extrinsic knowledge compared to a fine-tuned model intrinsic\nknowledge. We report our findings on the Large Language Models for Ontology\nLearning (LLMs4OL) 2024 challenge.\n","authors":["Hanna Abi Akl"],"pdf_url":"https://arxiv.org/pdf/2408.14236v1.pdf","comment":"8 pages, 4 figures, accepted for the LLMs4OL challenge at the\n International Semantic Web Conference (ISWC) 2024"},{"id":"http://arxiv.org/abs/2406.10265v2","updated":"2024-08-26T10:54:12Z","published":"2024-06-11T07:42:13Z","title":"Improving Language Models for Emotion Analysis: Insights from Cognitive\n Science","summary":" We propose leveraging cognitive science research on emotions and\ncommunication to improve language models for emotion analysis. First, we\npresent the main emotion theories in psychology and cognitive science. Then, we\nintroduce the main methods of emotion annotation in natural language processing\nand their connections to psychological theories. We also present the two main\ntypes of analyses of emotional communication in cognitive pragmatics. Finally,\nbased on the cognitive science research presented, we propose directions for\nimproving language models for emotion analysis. We suggest that these research\nefforts pave the way for constructing new annotation schemes, methods, and a\npossible benchmark for emotional understanding, considering different facets of\nhuman emotion and communication.\n","authors":["Constant Bonard","Gustave Cortal"],"pdf_url":"https://arxiv.org/pdf/2406.10265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05141v2","updated":"2024-08-26T10:53:28Z","published":"2024-08-09T15:53:55Z","title":"A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning","summary":" Retrieval-augmented generation (RAG) is a framework enabling large language\nmodels (LLMs) to enhance their accuracy and reduce hallucinations by\nintegrating external knowledge bases. In this paper, we introduce a hybrid RAG\nsystem enhanced through a comprehensive suite of optimizations that\nsignificantly improve retrieval quality, augment reasoning capabilities, and\nrefine numerical computation ability. We refined the text chunks and tables in\nweb pages, added attribute predictors to reduce hallucinations, conducted LLM\nKnowledge Extractor and Knowledge Graph Extractor, and finally built a\nreasoning strategy with all the references. We evaluated our system on the CRAG\ndataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and\nonline evaluations demonstrate that our system significantly enhances complex\nreasoning capabilities. In local evaluations, we have significantly improved\naccuracy and reduced error rates compared to the baseline model, achieving a\nnotable increase in scores. In the meanwhile, we have attained outstanding\nresults in online assessments, demonstrating the performance and generalization\ncapabilities of the proposed system. 
The source code for our system is released\nin \\url{https://gitlab.aicrowd.com/shizueyy/crag-new}.\n","authors":["Ye Yuan","Chengwu Liu","Jingyang Yuan","Gongbo Sun","Siqi Li","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05141v2.pdf","comment":"Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024"},{"id":"http://arxiv.org/abs/2312.03731v7","updated":"2024-08-26T10:11:45Z","published":"2023-11-28T02:36:53Z","title":"MultiGPrompt for Multi-Task Pre-Training and Prompting on Graphs","summary":" Graphs can inherently model interconnected objects on the Web, thereby\nfacilitating a series of Web applications, such as web analyzing and content\nrecommendation. Recently, Graph Neural Networks (GNNs) have emerged as a\nmainstream technique for graph representation learning. However, their efficacy\nwithin an end-to-end supervised framework is significantly tied to the\navailabilityof task-specific labels. To mitigate labeling costs and enhance\nrobustness in few-shot settings, pre-training on self-supervised tasks has\nemerged as a promising method, while prompting has been proposed to further\nnarrow the objective gap between pretext and downstream tasks. Although there\nhas been some initial exploration of prompt-based learning on graphs, they\nprimarily leverage a single pretext task, resulting in a limited subset of\ngeneral knowledge that could be learned from the pre-training data. Hence, in\nthis paper, we propose MultiGPrompt, a novel multi-task pre-training and\nprompting framework to exploit multiple pretext tasks for more comprehensive\npre-trained knowledge. First, in pre-training, we design a set of pretext\ntokens to synergize multiple pretext tasks. Second, we propose a dual-prompt\nmechanism consisting of composed and open prompts to leverage task-specific and\nglobal pre-training knowledge, to guide downstream tasks in few-shot settings.\nFinally, we conduct extensive experiments on six public datasets to evaluate\nand analyze MultiGPrompt.\n","authors":["Xingtong Yu","Chang Zhou","Yuan Fang","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03731v7.pdf","comment":"WWW2024 research track"},{"id":"http://arxiv.org/abs/2408.14154v1","updated":"2024-08-26T09:57:19Z","published":"2024-08-26T09:57:19Z","title":"Investigating the effect of Mental Models in User Interaction with an\n Adaptive Dialog Agent","summary":" Mental models play an important role in whether user interaction with\nintelligent systems, such as dialog systems is successful or not. Adaptive\ndialog systems present the opportunity to align a dialog agent's behavior with\nheterogeneous user expectations. However, there has been little research into\nwhat mental models users form when interacting with a task-oriented dialog\nsystem, how these models affect users' interactions, or what role system\nadaptation can play in this process, making it challenging to avoid damage to\nhuman-AI partnership. In this work, we collect a new publicly available dataset\nfor exploring user mental models about information seeking dialog systems. We\ndemonstrate that users have a variety of conflicting mental models about such\nsystems, the validity of which directly impacts the success of their\ninteractions and perceived usability of system. Furthermore, we show that\nadapting a dialog agent's behavior to better align with users' mental models,\neven when done implicitly, can improve perceived usability, dialog efficiency,\nand success. 
To this end, we argue that implicit adaptation can be a valid\nstrategy for task-oriented dialog systems, so long as developers first have a\nsolid understanding of users' mental models.\n","authors":["Lindsey Vanderlyn","Dirk Väth","Ngoc Thang Vu"],"pdf_url":"https://arxiv.org/pdf/2408.14154v1.pdf","comment":"submitted to COLING 2025"},{"id":"http://arxiv.org/abs/2408.14153v1","updated":"2024-08-26T09:55:34Z","published":"2024-08-26T09:55:34Z","title":"Explaining Vision-Language Similarities in Dual Encoders with\n Feature-Pair Attributions","summary":" Dual encoder architectures like CLIP models map two types of inputs into a\nshared embedding space and learn similarities between them. However, it is not\nunderstood how such models compare two inputs. Here, we address this research\ngap with two contributions. First, we derive a method to attribute predictions\nof any differentiable dual encoder onto feature-pair interactions between its\ninputs. Second, we apply our method to CLIP-type models and show that they\nlearn fine-grained correspondences between parts of captions and regions in\nimages. They match objects across input modes and also account for mismatches.\nHowever, this visual-linguistic grounding ability heavily varies between object\nclasses, depends on the training data distribution, and largely improves after\nin-domain training. Using our method we can identify knowledge gaps about\nspecific object classes in individual models and can monitor their improvement\nupon fine-tuning.\n","authors":["Lucas Möller","Pascal Tilli","Ngoc Thang Vu","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2408.14153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04660v3","updated":"2024-08-26T09:37:46Z","published":"2024-08-05T20:01:10Z","title":"XMainframe: A Large Language Model for Mainframe Modernization","summary":" Mainframe operating systems, despite their inception in the 1940s, continue\nto support critical sectors like finance and government. However, these systems\nare often viewed as outdated, requiring extensive maintenance and\nmodernization. Addressing this challenge necessitates innovative tools that can\nunderstand and interact with legacy codebases. To this end, we introduce\nXMainframe, a state-of-the-art large language model (LLM) specifically designed\nwith knowledge of mainframe legacy systems and COBOL codebases. Our solution\ninvolves the creation of an extensive data collection pipeline to produce\nhigh-quality training datasets, enhancing XMainframe's performance in this\nspecialized domain. Additionally, we present MainframeBench, a comprehensive\nbenchmark for assessing mainframe knowledge, including multiple-choice\nquestions, question answering, and COBOL code summarization. Our empirical\nevaluations demonstrate that XMainframe consistently outperforms existing\nstate-of-the-art LLMs across these tasks. Specifically, XMainframe achieves 30%\nhigher accuracy than DeepSeek-Coder on multiple-choice questions, doubles the\nBLEU score of Mixtral-Instruct 8x7B on question answering, and scores six times\nhigher than GPT-3.5 on COBOL summarization. Our work highlights the potential\nof XMainframe to drive significant advancements in managing and modernizing\nlegacy systems, thereby enhancing productivity and saving time for software\ndevelopers.\n","authors":["Anh T. V. Dau","Hieu Trung Dao","Anh Tuan Nguyen","Hieu Trung Tran","Phong X. Nguyen","Nghi D. Q. 
Bui"],"pdf_url":"https://arxiv.org/pdf/2408.04660v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14141v1","updated":"2024-08-26T09:37:42Z","published":"2024-08-26T09:37:42Z","title":"Crowd-Calibrator: Can Annotator Disagreement Inform Calibration in\n Subjective Tasks?","summary":" Subjective tasks in NLP have been mostly relegated to objective standards,\nwhere the gold label is decided by taking the majority vote. This obfuscates\nannotator disagreement and the inherent uncertainty of the label. We argue that\nsubjectivity should factor into model decisions and play a direct role via\ncalibration under a selective prediction setting. Specifically, instead of\ncalibrating confidence purely from the model's perspective, we calibrate models\nfor subjective tasks based on crowd worker agreement. Our method,\nCrowd-Calibrator, models the distance between the distribution of crowd worker\nlabels and the model's own distribution over labels to inform whether the model\nshould abstain from a decision. On two highly subjective tasks, hate speech\ndetection and natural language inference, our experiments show Crowd-Calibrator\neither outperforms or achieves competitive performance with existing selective\nprediction baselines. Our findings highlight the value of bringing human\ndecision-making into model predictions.\n","authors":["Urja Khurana","Eric Nalisnick","Antske Fokkens","Swabha Swayamdipta"],"pdf_url":"https://arxiv.org/pdf/2408.14141v1.pdf","comment":"Accepted at COLM 2024"},{"id":"http://arxiv.org/abs/2403.08564v2","updated":"2024-08-26T09:35:39Z","published":"2024-03-13T14:19:08Z","title":"Non-discrimination Criteria for Generative Language Models","summary":" Generative AI, such as large language models, has undergone rapid development\nwithin recent years. As these models become increasingly available to the\npublic, concerns arise about perpetuating and amplifying harmful biases in\napplications. Gender stereotypes can be harmful and limiting for the\nindividuals they target, whether they consist of misrepresentation or\ndiscrimination. Recognizing gender bias as a pervasive societal construct, this\npaper studies how to uncover and quantify the presence of gender biases in\ngenerative language models. In particular, we derive generative AI analogues of\nthree well-known non-discrimination criteria from classification, namely\nindependence, separation and sufficiency. To demonstrate these criteria in\naction, we design prompts for each of the criteria with a focus on occupational\ngender stereotype, specifically utilizing the medical test to introduce the\nground truth in the generative AI context. Our results address the presence of\noccupational gender bias within such conversational language models.\n","authors":["Sara Sterlie","Nina Weng","Aasa Feragen"],"pdf_url":"https://arxiv.org/pdf/2403.08564v2.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.14137v1","updated":"2024-08-26T09:34:36Z","published":"2024-08-26T09:34:36Z","title":"Multi-Faceted Evaluation of Modeling Languages for Augmented Reality\n Applications -- The Case of ARWFML","summary":" The evaluation of modeling languages for augmented reality applications poses\nparticular challenges due to the three-dimensional environment they target. The\npreviously introduced Augmented Reality Workflow Modeling Language (ARWFML)\nenables the model-based creation of augmented reality scenarios without\nprogramming knowledge. 
Building upon the first design cycle of the language's\nspecification, this paper presents two further design iterations for refining\nthe language based on multi-faceted evaluations. These include a comparative\nevaluation of implementation options and workflow capabilities, the\nintroduction of a 3D notation, and the development of a new 3D modeling\nenvironment. On this basis, a comprehensibility study of the language was\nconducted. Thereby, we show how modeling languages for augmented reality can be\nevolved towards a maturity level suitable for empirical evaluations.\n","authors":["Fabian Muff","Hans-Georg Fill"],"pdf_url":"https://arxiv.org/pdf/2408.14137v1.pdf","comment":"Accepted manuscript for the 43rd International Conference on\n Conceptual Modeling Conceptual Modeling, AI, and Beyond 28-31 October 2024 |\n Pittsburgh, Pennsylvania, USA"},{"id":"http://arxiv.org/abs/2408.14119v1","updated":"2024-08-26T09:08:26Z","published":"2024-08-26T09:08:26Z","title":"Contrastive Learning Subspace for Text Clustering","summary":" Contrastive learning has been frequently investigated to learn effective\nrepresentations for text clustering tasks. While existing contrastive\nlearning-based text clustering methods only focus on modeling instance-wise\nsemantic similarity relationships, they ignore contextual information and\nunderlying relationships among all instances that needs to be clustered. In\nthis paper, we propose a novel text clustering approach called Subspace\nContrastive Learning (SCL) which models cluster-wise relationships among\ninstances. Specifically, the proposed SCL consists of two main modules: (1) a\nself-expressive module that constructs virtual positive samples and (2) a\ncontrastive learning module that further learns a discriminative subspace to\ncapture task-specific cluster-wise relationships among texts. Experimental\nresults show that the proposed SCL method not only has achieved superior\nresults on multiple task clustering datasets but also has less complexity in\npositive sample construction.\n","authors":["Qian Yong","Chen Chen","Xiabing Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.14119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11534v6","updated":"2024-08-26T08:52:44Z","published":"2023-08-21T06:51:56Z","title":"PlatoLM: Teaching LLMs in Multi-Round Dialogue via a User Simulator","summary":" The unparalleled performance of closed-sourced ChatGPT has sparked efforts\ntowards its democratization, with notable strides made by leveraging real user\nand ChatGPT dialogues, as evidenced by Vicuna. However, due to challenges in\ngathering dialogues involving human participation, current endeavors like Baize\nand UltraChat rely on ChatGPT conducting roleplay to simulate humans based on\ninstructions, resulting in overdependence on seeds, diminished human-likeness,\nlimited topic diversity, and an absence of genuine multi-round conversational\ndynamics. To address the above issues, we propose a paradigm to simulate human\nbehavior better and explore the benefits of incorporating more human-like\nquestions in multi-turn conversations. Specifically, we directly target human\nquestions extracted from genuine human-machine conversations as a learning goal\nand provide a novel user simulator called `Socratic'. The experimental results\nshow our response model, `PlatoLM', achieves SoTA performance among LLaMA-based\n7B models in MT-Bench. 
Our findings further demonstrate that our method\nintroduces highly human-like questioning patterns and rich topic structures,\nwhich can teach the response model better than previous works in multi-round\nconversations.\n","authors":["Chuyi Kong","Yaxin Fan","Xiang Wan","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11534v6.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2406.10833v2","updated":"2024-08-26T08:47:54Z","published":"2024-06-16T08:03:24Z","title":"A Comprehensive Survey of Scientific Large Language Models and Their\n Applications in Scientific Discovery","summary":" In many scientific fields, large language models (LLMs) have revolutionized\nthe way text and other modalities of data (e.g., molecules and proteins) are\nhandled, achieving superior performance in various applications and augmenting\nthe scientific discovery process. Nevertheless, previous surveys on scientific\nLLMs often concentrate on one or two fields or a single modality. In this\npaper, we aim to provide a more holistic view of the research landscape by\nunveiling cross-field and cross-modal connections between scientific LLMs\nregarding their architectures and pre-training techniques. To this end, we\ncomprehensively survey over 250 scientific LLMs, discuss their commonalities\nand differences, as well as summarize pre-training datasets and evaluation\ntasks for each field and modality. Moreover, we investigate how LLMs have been\ndeployed to benefit scientific discovery. Resources related to this survey are\navailable at https://github.com/yuzhimanhua/Awesome-Scientific-Language-Models.\n","authors":["Yu Zhang","Xiusi Chen","Bowen Jin","Sheng Wang","Shuiwang Ji","Wei Wang","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2406.10833v2.pdf","comment":"34 pages (GitHub:\n https://github.com/yuzhimanhua/Awesome-Scientific-Language-Models)"},{"id":"http://arxiv.org/abs/2403.11322v4","updated":"2024-08-26T08:25:01Z","published":"2024-03-17T19:54:16Z","title":"StateFlow: Enhancing LLM Task-Solving through State-Driven Workflows","summary":" It is a notable trend to use Large Language Models (LLMs) to tackle complex\ntasks, e.g., tasks that require a sequence of actions and dynamic interaction\nwith tools and external environments. In this paper, we propose StateFlow, a\nnovel LLM-based task-solving paradigm that conceptualizes complex task-solving\nprocesses as state machines. In StateFlow, we distinguish between \"process\ngrounding\" (via state and state transitions) and \"sub-task solving\" (through\nactions within a state), enhancing control and interpretability of the\ntask-solving procedure. A state represents the status of a running process. The\ntransitions between states are controlled by heuristic rules or decisions made\nby the LLM, allowing for a dynamic and adaptive progression. Upon entering a\nstate, a series of actions is executed, involving not only calling LLMs guided\nby different prompts, but also the utilization of external tools as needed. Our\nresults show that StateFlow significantly enhances LLMs' efficiency. 
For\ninstance, StateFlow achieves 13% and 28% higher success rates compared to ReAct\nin InterCode SQL and ALFWorld benchmark, with 5x and 3x less cost respectively.\nWe also show that StateFlow can be combined with iterative refining methods\nlike Reflexion to further improve performance.\n","authors":["Yiran Wu","Tianwei Yue","Shaokun Zhang","Chi Wang","Qingyun Wu"],"pdf_url":"https://arxiv.org/pdf/2403.11322v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03624v2","updated":"2024-08-26T08:09:39Z","published":"2024-07-04T04:19:50Z","title":"Question-Analysis Prompting Improves LLM Performance in Reasoning Tasks","summary":" Although LLMs have the potential to transform many fields, they still\nunderperform humans in reasoning tasks. Existing methods induce the model to\nproduce step-by-step calculations, but this research explores the question:\nDoes making the LLM analyze the question improve its performance? We propose a\nnovel prompting strategy called Question Analysis Prompting (QAP), in which the\nmodel is prompted to explain the question in $n$ words before solving. The\nvalue of $n$ influences the length of response generated by the model. QAP is\nevaluated on GPT 3.5 Turbo and GPT 4 Turbo on arithmetic datasets GSM8K, AQuA,\nand SAT and commonsense dataset StrategyQA. QAP is compared with other\nstate-of-the-art prompts including Chain-of-Thought (CoT), Plan and Solve\nPrompting (PS+) and Take A Deep Breath (TADB). QAP outperforms all\nstate-of-the-art prompts on AQuA and SAT datasets on both GPT3.5 and GPT4. QAP\nconsistently ranks among the top-2 prompts on 75\\% of the tests. A key factor\nof QAP performance can be attributed to response length, where detailed\nresponses are beneficial when answering harder questions, but can negatively\naffect easy questions.\n","authors":["Dharunish Yugeswardeenoo","Kevin Zhu","Sean O'Brien"],"pdf_url":"https://arxiv.org/pdf/2407.03624v2.pdf","comment":"Accepted in Proceedings of the 62nd Annual Meeting of the Association\n for Computational Linguistics: Student Research Workshop (ACL-SRW 2024) 11\n pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.09893v2","updated":"2024-08-26T07:54:27Z","published":"2024-07-13T13:58:24Z","title":"Synergistic Multi-Agent Framework with Trajectory Learning for\n Knowledge-Intensive Tasks","summary":" Recent advancements in Large Language Models (LLMs) have led to significant\nbreakthroughs in various natural language processing tasks. However, generating\nfactually consistent responses in knowledge-intensive scenarios remains a\nchallenge due to issues such as hallucination, difficulty in acquiring\nlong-tailed knowledge, and limited memory expansion. This paper introduces\nSMART, a novel multi-agent framework that leverages external knowledge to\nenhance the interpretability and factual consistency of LLM-generated\nresponses. SMART comprises four specialized agents, each performing a specific\nsub-trajectory action to navigate complex knowledge-intensive tasks. We propose\na multi-agent co-training paradigm, Long-Short Trajectory Learning, which\nensures synergistic collaboration among agents while maintaining fine-grained\nexecution by each agent. Extensive experiments on five knowledge-intensive\ntasks demonstrate SMART's superior performance compared to widely adopted\nknowledge internalization and knowledge enhancement methods. Our framework can\nextend beyond knowledge-intensive tasks to more complex scenarios. 
Our code is\navailable at https://github.com/yueshengbin/SMART.\n","authors":["Shengbin Yue","Siyuan Wang","Wei Chen","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2407.09893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04975v4","updated":"2024-08-26T07:48:19Z","published":"2024-08-09T09:56:30Z","title":"reCSE: Portable Reshaping Features for Sentence Embedding in\n Self-supervised Contrastive Learning","summary":" We propose reCSE, a self supervised contrastive learning sentence\nrepresentation framework based on feature reshaping. This framework is\ndifferent from the current advanced models that use discrete data augmentation\nmethods, but instead reshapes the input features of the original sentence,\naggregates the global information of each token in the sentence, and alleviates\nthe common problems of representation polarity and GPU memory consumption\nlinear increase in current advanced models. In addition, our reCSE has achieved\ncompetitive performance in semantic similarity tasks. And the experiment proves\nthat our proposed feature reshaping method has strong universality, which can\nbe transplanted to other self supervised contrastive learning frameworks and\nenhance their representation ability, even achieving state-of-the-art\nperformance. Our code is available at https://github.com/heavenhellchen/reCSE.\n","authors":["Fufangchen Zhao","Jian Gao","Danfeng Yan"],"pdf_url":"https://arxiv.org/pdf/2408.04975v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10903v3","updated":"2024-08-26T07:37:19Z","published":"2024-08-20T14:47:38Z","title":"BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General\n Role-Playing Language Model","summary":" The rapid advancement of large language models (LLMs) has revolutionized\nrole-playing, enabling the development of general role-playing models. However,\ncurrent role-playing training has two significant issues: (I) Using a\npredefined role profile to prompt dialogue training for specific scenarios\nusually leads to inconsistencies and even conflicts between the dialogue and\nthe profile, resulting in training biases. (II) The model learns to imitate the\nrole based solely on the profile, neglecting profile-dialogue alignment at the\nsentence level. In this work, we propose a simple yet effective framework\ncalled BEYOND DIALOGUE, designed to overcome these hurdles. This framework\ninnovatively introduces \"beyond dialogue\" tasks to align dialogue with profile\ntraits based on each specific scenario, thereby eliminating biases during\ntraining. Furthermore, by adopting an innovative prompting mechanism that\ngenerates reasoning outcomes for training, the framework allows the model to\nachieve fine-grained alignment between profile and dialogue at the sentence\nlevel. The aforementioned methods are fully automated and low-cost.\nAdditionally, the integration of automated dialogue and objective evaluation\nmethods forms a comprehensive framework, paving the way for general\nrole-playing. Experimental results demonstrate that our model excels in\nadhering to and reflecting various dimensions of role profiles, outperforming\nmost proprietary general and specialized role-playing baselines. 
All code and\ndatasets are available at https://github.com/yuyouyu32/BeyondDialogue.\n","authors":["Yeyong Yu","Rusheng Yu","Haojie Wei","Zhanqiu Zhang","Quan Qian"],"pdf_url":"https://arxiv.org/pdf/2408.10903v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14053v1","updated":"2024-08-26T07:19:07Z","published":"2024-08-26T07:19:07Z","title":"Enhancing Depression Diagnosis with Chain-of-Thought Prompting","summary":" When using AI to detect signs of depressive disorder, AI models habitually\ndraw preemptive conclusions. We theorize that using chain-of-thought (CoT)\nprompting to evaluate Patient Health Questionnaire-8 (PHQ-8) scores will\nimprove the accuracy of the scores determined by AI models. In our findings,\nwhen the models reasoned with CoT, the estimated PHQ-8 scores were consistently\ncloser on average to the accepted true scores reported by each participant\ncompared to when not using CoT. Our goal is to expand upon AI models'\nunderstanding of the intricacies of human conversation, allowing them to more\neffectively assess a patient's feelings and tone, therefore being able to more\naccurately discern mental disorder symptoms; ultimately, we hope to augment AI\nmodels' abilities, so that they can be widely accessible and used in the\nmedical field.\n","authors":["Elysia Shi","Adithri Manda","London Chowdhury","Runeema Arun","Kevin Zhu","Michael Lam"],"pdf_url":"https://arxiv.org/pdf/2408.14053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16535v3","updated":"2024-08-26T07:14:46Z","published":"2023-09-28T15:47:03Z","title":"KLoB: a Benchmark for Assessing Knowledge Locating Methods in Language\n Models","summary":" Recently, Locate-Then-Edit paradigm has emerged as one of the main approaches\nin changing factual knowledge stored in the Language models. However, there is\na lack of research on whether present locating methods can pinpoint the exact\nparameters embedding the desired knowledge. Moreover, although many researchers\nhave questioned the validity of locality hypothesis of factual knowledge, no\nmethod is provided to test the a hypothesis for more in-depth discussion and\nresearch. Therefore, we introduce KLoB, a benchmark examining three essential\nproperties that a reliable knowledge locating method should satisfy. KLoB can\nserve as a benchmark for evaluating existing locating methods in language\nmodels, and can contributes a method to reassessing the validity of locality\nhypothesis of factual knowledge. KLoB is publicly available at an anonymous\nGitHub: \\url{https://github.com/anon6662/KLoB}.\n","authors":["Yiming Ju","Xingrun Xing","Zhixiong Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.16535v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03791v2","updated":"2024-08-26T07:13:47Z","published":"2024-07-04T09:55:04Z","title":"M5 -- A Diverse Benchmark to Assess the Performance of Large Multimodal\n Models Across Multilingual and Multicultural Vision-Language Tasks","summary":" Since the release of ChatGPT, the field of Natural Language Processing has\nexperienced rapid advancements, particularly in Large Language Models (LLMs)\nand their multimodal counterparts, Large Multimodal Models (LMMs). Despite\ntheir impressive capabilities, LLMs often exhibit significant performance\ndisparities across different languages and cultural contexts, as demonstrated\nby various text-only benchmarks. However, current research lacks such\nbenchmarks for multimodal visio-linguistic settings. 
This work fills this gap\nby introducing M5, the first comprehensive benchmark designed to evaluate LMMs\non diverse vision-language tasks within a multilingual and multicultural\ncontext. M5 includes eight datasets covering five tasks and $41$ languages,\nwith a focus on underrepresented languages and culturally diverse images.\nFurthermore, we introduce two novel datasets, M5-VGR and M5-VLOD, including a\nnew Visio-Linguistic Outlier Detection task, in which all evaluated open-source\nmodels fail to significantly surpass the random baseline. Through extensive\nevaluation and analyses, we highlight substantial task-agnostic performance\ndisparities between high- and low-resource languages. Moreover, we show that\nlarger models do not necessarily outperform smaller ones in a multilingual\nsetting.\n","authors":["Florian Schneider","Sunayana Sitaram"],"pdf_url":"https://arxiv.org/pdf/2407.03791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06607v4","updated":"2024-08-26T06:57:51Z","published":"2023-11-11T16:37:41Z","title":"Monkey: Image Resolution and Text Label Are Important Things for Large\n Multi-modal Models","summary":" Large Multimodal Models (LMMs) have shown promise in vision-language tasks\nbut struggle with high-resolution input and detailed scene understanding.\nAddressing these challenges, we introduce Monkey to enhance LMM capabilities.\nFirstly, Monkey processes input images by dividing them into uniform patches,\neach matching the size (e.g., 448x448) used in the original training of the\nwell-trained vision encoder. Equipped with individual adapter for each patch,\nMonkey can handle higher resolutions up to 1344x896 pixels, enabling the\ndetailed capture of complex visual information. Secondly, it employs a\nmulti-level description generation method, enriching the context for\nscene-object associations. This two-part strategy ensures more effective\nlearning from generated data: the higher resolution allows for a more detailed\ncapture of visuals, which in turn enhances the effectiveness of comprehensive\ndescriptions. Extensive ablative results validate the effectiveness of our\ndesigns. Additionally, experiments on 18 datasets further demonstrate that\nMonkey surpasses existing LMMs in many tasks like Image Captioning and various\nVisual Question Answering formats. Specially, in qualitative tests focused on\ndense text question answering, Monkey has exhibited encouraging results\ncompared with GPT4V. Code is available at\nhttps://github.com/Yuliang-Liu/Monkey.\n","authors":["Zhang Li","Biao Yang","Qiang Liu","Zhiyin Ma","Shuo Zhang","Jingxu Yang","Yabo Sun","Yuliang Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2311.06607v4.pdf","comment":"CVPR 2024 Highlight"},{"id":"http://arxiv.org/abs/2408.03633v3","updated":"2024-08-26T06:19:53Z","published":"2024-08-07T08:44:44Z","title":"CARE: A Clue-guided Assistant for CSRs to Read User Manuals","summary":" It is time-saving to build a reading assistant for customer service\nrepresentations (CSRs) when reading user manuals, especially information-rich\nones. Current solutions don't fit the online custom service scenarios well due\nto the lack of attention to user questions and possible responses. Hence, we\npropose to develop a time-saving and careful reading assistant for CSRs, named\nCARE. It can help the CSRs quickly find proper responses from the user manuals\nvia explicit clue chains. 
Specifically, each of the clue chains is formed by\ninferring over the user manuals, starting from the question clue aligned with\nthe user question and ending at a possible response. To overcome the shortage\nof supervised data, we adopt the self-supervised strategy for model learning.\nThe offline experiment shows that CARE is efficient in automatically inferring\naccurate responses from the user manual. The online experiment further\ndemonstrates the superiority of CARE to reduce CSRs' reading burden and keep\nhigh service quality, in particular with >35% decrease in time spent and\nkeeping a >0.75 ICC score.\n","authors":["Weihong Du","Jia Liu","Zujie Wen","Dingnan Jin","Hongru Liang","Wenqiang Lei"],"pdf_url":"https://arxiv.org/pdf/2408.03633v3.pdf","comment":"Accepted to The 62nd Annual Meeting of the Association for\n Computational Linguistics (ACL 2024)"},{"id":"http://arxiv.org/abs/2408.14028v1","updated":"2024-08-26T05:38:27Z","published":"2024-08-26T05:38:27Z","title":"SurGen: Text-Guided Diffusion Model for Surgical Video Generation","summary":" Diffusion-based video generation models have made significant strides,\nproducing outputs with improved visual fidelity, temporal coherence, and user\ncontrol. These advancements hold great promise for improving surgical education\nby enabling more realistic, diverse, and interactive simulation environments.\nIn this study, we introduce SurGen, a text-guided diffusion model tailored for\nsurgical video synthesis, producing the highest resolution and longest duration\nvideos among existing surgical video generation models. We validate the visual\nand temporal quality of the outputs using standard image and video generation\nmetrics. Additionally, we assess their alignment to the corresponding text\nprompts through a deep learning classifier trained on surgical data. Our\nresults demonstrate the potential of diffusion models to serve as valuable\neducational tools for surgical trainees.\n","authors":["Joseph Cho","Samuel Schmidgall","Cyril Zakka","Mrudang Mathur","Rohan Shad","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2408.14028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14026v1","updated":"2024-08-26T05:36:35Z","published":"2024-08-26T05:36:35Z","title":"Empowering Low-Resource Language ASR via Large-Scale Pseudo Labeling","summary":" In this study, we tackle the challenge of limited labeled data for\nlow-resource languages in ASR, focusing on Hindi. Specifically, we explore\npseudo-labeling, by proposing a generic framework combining multiple ideas from\nexisting works. Our framework integrates multiple base models for transcription\nand evaluators for assessing audio-transcript pairs, resulting in robust\npseudo-labeling for low resource languages. We validate our approach with a new\nbenchmark, IndicYT, comprising diverse YouTube audio files from multiple\ncontent categories. Our findings show that augmenting pseudo labeled data from\nYouTube with existing training data leads to significant performance\nimprovements on IndicYT, without affecting performance on out-of-domain\nbenchmarks, demonstrating the efficacy of pseudo-labeled data in enhancing ASR\ncapabilities for low-resource languages. The benchmark, code and models\ndeveloped as a part of this work will be made publicly available.\n","authors":["Kaushal Santosh Bhogale","Deovrat Mehendale","Niharika Parasa","Sathish Kumar Reddy G","Tahir Javed","Pratyush Kumar","Mitesh M. 
Khapra"],"pdf_url":"https://arxiv.org/pdf/2408.14026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05561v5","updated":"2024-08-26T05:31:38Z","published":"2024-01-10T22:07:21Z","title":"TrustLLM: Trustworthiness in Large Language Models","summary":" Large language models (LLMs), exemplified by ChatGPT, have gained\nconsiderable attention for their excellent natural language processing\ncapabilities. Nonetheless, these LLMs present many challenges, particularly in\nthe realm of trustworthiness. Therefore, ensuring the trustworthiness of LLMs\nemerges as an important topic. This paper introduces TrustLLM, a comprehensive\nstudy of trustworthiness in LLMs, including principles for different dimensions\nof trustworthiness, established benchmark, evaluation, and analysis of\ntrustworthiness for mainstream LLMs, and discussion of open challenges and\nfuture directions. Specifically, we first propose a set of principles for\ntrustworthy LLMs that span eight different dimensions. Based on these\nprinciples, we further establish a benchmark across six dimensions including\ntruthfulness, safety, fairness, robustness, privacy, and machine ethics. We\nthen present a study evaluating 16 mainstream LLMs in TrustLLM, consisting of\nover 30 datasets. Our findings firstly show that in general trustworthiness and\nutility (i.e., functional effectiveness) are positively related. Secondly, our\nobservations reveal that proprietary LLMs generally outperform most open-source\ncounterparts in terms of trustworthiness, raising concerns about the potential\nrisks of widely accessible open-source LLMs. However, a few open-source LLMs\ncome very close to proprietary ones. Thirdly, it is important to note that some\nLLMs may be overly calibrated towards exhibiting trustworthiness, to the extent\nthat they compromise their utility by mistakenly treating benign prompts as\nharmful and consequently not responding. Finally, we emphasize the importance\nof ensuring transparency not only in the models themselves but also in the\ntechnologies that underpin trustworthiness. Knowing the specific trustworthy\ntechnologies that have been employed is crucial for analyzing their\neffectiveness.\n","authors":["Yue Huang","Lichao Sun","Haoran Wang","Siyuan Wu","Qihui Zhang","Yuan Li","Chujie Gao","Yixin Huang","Wenhan Lyu","Yixuan Zhang","Xiner Li","Zhengliang Liu","Yixin Liu","Yijue Wang","Zhikun Zhang","Bertie Vidgen","Bhavya Kailkhura","Caiming Xiong","Chaowei Xiao","Chunyuan Li","Eric Xing","Furong Huang","Hao Liu","Heng Ji","Hongyi Wang","Huan Zhang","Huaxiu Yao","Manolis Kellis","Marinka Zitnik","Meng Jiang","Mohit Bansal","James Zou","Jian Pei","Jian Liu","Jianfeng Gao","Jiawei Han","Jieyu Zhao","Jiliang Tang","Jindong Wang","Joaquin Vanschoren","John Mitchell","Kai Shu","Kaidi Xu","Kai-Wei Chang","Lifang He","Lifu Huang","Michael Backes","Neil Zhenqiang Gong","Philip S. 
Yu","Pin-Yu Chen","Quanquan Gu","Ran Xu","Rex Ying","Shuiwang Ji","Suman Jana","Tianlong Chen","Tianming Liu","Tianyi Zhou","William Wang","Xiang Li","Xiangliang Zhang","Xiao Wang","Xing Xie","Xun Chen","Xuyu Wang","Yan Liu","Yanfang Ye","Yinzhi Cao","Yong Chen","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.05561v5.pdf","comment":"This work is still under work and we welcome your contribution"},{"id":"http://arxiv.org/abs/2405.14213v2","updated":"2024-08-26T04:59:05Z","published":"2024-05-23T06:17:23Z","title":"From Text to Pixel: Advancing Long-Context Understanding in MLLMs","summary":" The rapid progress in Multimodal Large Language Models (MLLMs) has\nsignificantly advanced their ability to process and understand complex visual\nand textual information. However, the integration of multiple images and\nextensive textual contexts remains a challenge due to the inherent limitation\nof the models' capacity to handle long input sequences efficiently. In this\npaper, we introduce SEEKER, a multimodal large language model designed to\ntackle this issue. SEEKER aims to optimize the compact encoding of long text by\ncompressing the text sequence into the visual pixel space via images, enabling\nthe model to handle long text within a fixed token-length budget efficiently.\nOur empirical experiments on six long-context multimodal tasks demonstrate that\nSEEKER can leverage fewer image tokens to convey the same amount of textual\ninformation compared with the OCR-based approach, and is more efficient in\nunderstanding long-form multimodal input and generating long-form textual\noutput, outperforming all existing proprietary and open-source MLLMs by large\nmargins.\n","authors":["Yujie Lu","Xiujun Li","Tsu-Jui Fu","Miguel Eckstein","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12529v2","updated":"2024-08-26T04:28:41Z","published":"2024-07-17T13:11:28Z","title":"Crafting the Path: Robust Query Rewriting for Information Retrieval","summary":" Query rewriting aims to generate a new query that can complement the original\nquery to improve the information retrieval system. Recent studies on query\nrewriting, such as query2doc, query2expand and querey2cot, rely on the internal\nknowledge of Large Language Models (LLMs) to generate a relevant passage to add\ninformation to the query. Nevertheless, the efficacy of these methodologies may\nmarkedly decline in instances where the requisite knowledge is not encapsulated\nwithin the model's intrinsic parameters. In this paper, we propose a novel\nstructured query rewriting method called Crafting the Path tailored for\nretrieval systems. Crafting the Path involves a three-step process that crafts\nquery-related information necessary for finding the passages to be searched in\neach step. Specifically, the Crafting the Path begins with Query Concept\nComprehension, proceeds to Query Type Identification, and finally conducts\nExpected Answer Extraction. Experimental results show that our method\noutperforms previous rewriting methods, especially in less familiar domains for\nLLMs. We demonstrate that our method is less dependent on the internal\nparameter knowledge of the model and generates queries with fewer factual\ninaccuracies. 
Furthermore, we observe that \\name{} demonstrates superior\nperformance in the retrieval-augmented generation scenarios.\n","authors":["Ingeol Baek","Jimin Lee","Joonho Yang","Hwanhee Lee"],"pdf_url":"https://arxiv.org/pdf/2407.12529v2.pdf","comment":"3 figures, 13 tables"},{"id":"http://arxiv.org/abs/2408.12321v2","updated":"2024-08-26T04:27:54Z","published":"2024-08-22T11:57:16Z","title":"MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework\n for Multimodal Large Language Model","summary":" This paper presents MaVEn, an innovative Multi-granularity Visual Encoding\nframework designed to enhance the capabilities of Multimodal Large Language\nModels (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on\nsingle-image visual understanding, limiting their ability to interpret and\nintegrate information across multiple images. MaVEn addresses this limitation\nby combining discrete visual symbol sequences, which abstract coarse-grained\nsemantic concepts, with traditional continuous representation sequences that\nmodel fine-grained features. This dual approach bridges the semantic gap\nbetween visual and textual data, thereby improving the model's ability to\nprocess and interpret information from multiple images effectively.\nAdditionally, we design a dynamic reduction mechanism by for long-sequence\ncontinuous features to enhance multi-image processing efficiency. Experimental\nresults demonstrate that MaVEn significantly enhances MLLMs' understanding in\ncomplex multi-image scenarios, while also improving performance in single-image\ncontexts.\n","authors":["Chaoya Jiang","Jia Hongrui","Haiyang Xu","Wei Ye","Mengfan Dong","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13282v1","updated":"2024-08-26T02:53:55Z","published":"2024-08-26T02:53:55Z","title":"Question answering system of bridge design specification based on large\n language model","summary":" This paper constructs question answering system for bridge design\nspecification based on large language model. Three implementation schemes are\ntried: full fine-tuning of the Bert pretrained model, parameter-efficient\nfine-tuning of the Bert pretrained model, and self-built language model from\nscratch. Through the self-built question and answer task dataset, based on the\ntensorflow and keras deep learning platform framework, the model is constructed\nand trained to predict the start position and end position of the answer in the\nbridge design specification given by the user. The experimental results show\nthat full fine-tuning of the Bert pretrained model achieves 100% accuracy in\nthe training-dataset, validation-dataset and test-dataset, and the system can\nextract the answers from the bridge design specification given by the user to\nanswer various questions of the user; While parameter-efficient fine-tuning of\nthe Bert pretrained model and self-built language model from scratch perform\nwell in the training-dataset, their generalization ability in the test-dataset\nneeds to be improved. 
The research of this paper provides a useful reference\nfor the development of question answering system in professional field.\n","authors":["Leye Zhang","Xiangxiang Tian","Hongjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13282v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.13987v1","updated":"2024-08-26T02:53:24Z","published":"2024-08-26T02:53:24Z","title":"Focused Large Language Models are Stable Many-Shot Learners","summary":" In-Context Learning (ICL) enables large language models (LLMs) to achieve\nrapid task adaptation by learning from demonstrations. With the increase in\navailable context length of LLMs, recent experiments have shown that the\nperformance of ICL does not necessarily scale well in many-shot (demonstration)\nsettings. We theoretically and experimentally confirm that the reason lies in\nmore demonstrations dispersing the model attention from the query, hindering\nits understanding of key content. Inspired by how humans learn from examples,\nwe propose a training-free method FocusICL, which conducts triviality filtering\nto avoid attention being diverted by unimportant contents at token-level and\noperates hierarchical attention to further ensure sufficient attention towards\ncurrent query at demonstration-level. We also design an efficient\nhyperparameter searching strategy for FocusICL based on model perplexity of\ndemonstrations. Comprehensive experiments validate that FocusICL achieves an\naverage performance improvement of 5.2% over vanilla ICL and scales well with\nmany-shot demonstrations.\n","authors":["Peiwen Yuan","Shaoxiong Feng","Yiwei Li","Xinglin Wang","Yueqi Zhang","Chuyi Tan","Boyuan Pan","Heda Wang","Yao Hu","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2408.13987v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2305.07895v7","updated":"2024-08-26T02:37:14Z","published":"2023-05-13T11:28:37Z","title":"OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models","summary":" Large models have recently played a dominant role in natural language\nprocessing and multimodal vision-language learning. However, their\neffectiveness in text-related visual tasks remains relatively unexplored. In\nthis paper, we conducted a comprehensive evaluation of Large Multimodal Models,\nsuch as GPT4V and Gemini, in various text-related visual tasks including Text\nRecognition, Scene Text-Centric Visual Question Answering (VQA),\nDocument-Oriented VQA, Key Information Extraction (KIE), and Handwritten\nMathematical Expression Recognition (HMER). To facilitate the assessment of\nOptical Character Recognition (OCR) capabilities in Large Multimodal Models, we\npropose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29\ndatasets, making it the most comprehensive OCR evaluation benchmark available.\nFurthermore, our study reveals both the strengths and weaknesses of these\nmodels, particularly in handling multilingual text, handwritten text,\nnon-semantic text, and mathematical expression recognition. Most importantly,\nthe baseline results presented in this study could provide a foundational\nframework for the conception and assessment of innovative strategies targeted\nat enhancing zero-shot multimodal techniques. 
The evaluation pipeline and\nbenchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.\n","authors":["Yuliang Liu","Zhang Li","Mingxin Huang","Biao Yang","Wenwen Yu","Chunyuan Li","Xucheng Yin","Cheng-lin Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2305.07895v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13986v1","updated":"2024-08-26T02:36:55Z","published":"2024-08-26T02:36:55Z","title":"AgentMove: Predicting Human Mobility Anywhere Using Large Language Model\n based Agentic Framework","summary":" Human mobility prediction plays a crucial role in various real-world\napplications. Although deep learning based models have shown promising results\nover the past decade, their reliance on extensive private mobility data for\ntraining and their inability to perform zero-shot predictions, have hindered\nfurther advancements. Recently, attempts have been made to apply large language\nmodels (LLMs) to mobility prediction task. However, their performance has been\nconstrained by the absence of a systematic design of workflow. They directly\ngenerate the final output using LLMs, which limits the potential of LLMs to\nuncover complex mobility patterns and underestimates their extensive reserve of\nglobal geospatial knowledge. In this paper, we introduce AgentMove, a\nsystematic agentic prediction framework to achieve generalized mobility\nprediction for any cities worldwide. In AgentMove, we first decompose the\nmobility prediction task into three sub-tasks and then design corresponding\nmodules to complete these subtasks, including spatial-temporal memory for\nindividual mobility pattern mining, world knowledge generator for modeling the\neffects of urban structure and collective knowledge extractor for capturing the\nshared patterns among population. Finally, we combine the results of three\nmodules and conduct a reasoning step to generate the final predictions.\nExtensive experiments on mobility data from two sources in 12 cities\ndemonstrate that AgentMove outperforms the best baseline more than 8% in\nvarious metrics and it shows robust predictions with various LLMs as base and\nalso less geographical bias across cities. Codes and data can be found in\nhttps://github.com/tsinghua-fib-lab/AgentMove.\n","authors":["Jie Feng","Yuwei Du","Jie Zhao","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2408.13986v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.13985v1","updated":"2024-08-26T02:35:37Z","published":"2024-08-26T02:35:37Z","title":"TF-Attack: Transferable and Fast Adversarial Attacks on Large Language\n Models","summary":" With the great advancements in large language models (LLMs), adversarial\nattacks against LLMs have recently attracted increasing attention. We found\nthat pre-existing adversarial attack methodologies exhibit limited\ntransferability and are notably inefficient, particularly when applied to LLMs.\nIn this paper, we analyze the core mechanisms of previous predominant\nadversarial attack methods, revealing that 1) the distributions of importance\nscore differ markedly among victim models, restricting the transferability; 2)\nthe sequential attack processes induces substantial time overheads. Based on\nthe above two insights, we introduce a new scheme, named TF-Attack, for\nTransferable and Fast adversarial attacks on LLMs. TF-Attack employs an\nexternal LLM as a third-party overseer rather than the victim model to identify\ncritical units within sentences. 
Moreover, TF-Attack introduces the concept of\nImportance Level, which allows for parallel substitutions of attacks. We\nconduct extensive experiments on 6 widely adopted benchmarks, evaluating the\nproposed method through both automatic and human metrics. Results show that our\nmethod consistently surpasses previous methods in transferability and delivers\nsignificant speed improvements, up to 20 times faster than earlier attack\nstrategies.\n","authors":["Zelin Li","Kehai Chen","Xuefeng Bai","Lemao Liu","Mingming Yang","Yang Xiang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13985v1.pdf","comment":"14 pages, 6 figures. arXiv admin note: text overlap with\n arXiv:2305.17440 by other authors"},{"id":"http://arxiv.org/abs/2408.12095v2","updated":"2024-08-26T02:26:31Z","published":"2024-08-22T03:08:49Z","title":"uMedSum: A Unified Framework for Advancing Medical Abstractive\n Summarization","summary":" Medical abstractive summarization faces the challenge of balancing\nfaithfulness and informativeness. Current methods often sacrifice key\ninformation for faithfulness or introduce confabulations when prioritizing\ninformativeness. While recent advancements in techniques like in-context\nlearning (ICL) and fine-tuning have improved medical summarization, they often\noverlook crucial aspects such as faithfulness and informativeness without\nconsidering advanced methods like model reasoning and self-improvement.\nMoreover, the field lacks a unified benchmark, hindering systematic evaluation\ndue to varied metrics and datasets. This paper addresses these gaps by\npresenting a comprehensive benchmark of six advanced abstractive summarization\nmethods across three diverse datasets using five standardized metrics. Building\non these findings, we propose uMedSum, a modular hybrid summarization framework\nthat introduces novel approaches for sequential confabulation removal followed\nby key missing information addition, ensuring both faithfulness and\ninformativeness. Our work improves upon previous GPT-4-based state-of-the-art\n(SOTA) medical summarization methods, significantly outperforming them in both\nquantitative metrics and qualitative domain expert evaluations. Notably, we\nachieve an average relative performance improvement of 11.8% in reference-free\nmetrics over the previous SOTA. Doctors prefer uMedSum's summaries 6 times more\nthan previous SOTA in difficult cases where there are chances of confabulations\nor missing information. These results highlight uMedSum's effectiveness and\ngeneralizability across various datasets and metrics, marking a significant\nadvancement in medical summarization.\n","authors":["Aishik Nagar","Yutong Liu","Andy T. Liu","Viktor Schlegel","Vijay Prakash Dwivedi","Arun-Kumar Kaliya-Perumal","Guna Pratheep Kalanchiam","Yili Tang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2408.12095v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2310.03328v3","updated":"2024-08-26T02:05:37Z","published":"2023-10-05T05:55:06Z","title":"Reformulating Domain Adaptation of Large Language Models as\n Adapt-Retrieve-Revise: A Case Study on Chinese Legal Domain","summary":" While large language models (LLMs) like GPT-4 have recently demonstrated\nastonishing zero-shot capabilities in general domain tasks, they often generate\ncontent with hallucinations in specific domains such as Chinese law, hindering\ntheir application in these areas. 
This is typically due to the absence of\ntraining data that encompasses such a specific domain, preventing GPT-4 from\nacquiring in-domain knowledge. A pressing challenge is that it's not plausible\nto continue training LLMs of such scale on in-domain data.\n This paper introduces a simple and effective domain adaptation framework for\nGPT-4 by reformulating generation as an \\textbf{adapt-retrieve-revise} process.\nThe initial step is to \\textbf{adapt} an affordable 7B LLM to the target domain\nby continuing learning on in-domain data. When solving a task, we leverage the\nadapted LLM to generate a draft answer given a task query. Then, the draft\nanswer will be used to \\textbf{retrieve} supporting evidence candidates from an\nexternal in-domain knowledge base. Finally, the draft answer and retrieved\nevidence are concatenated into a whole prompt to let GPT-4 assess the evidence\nand \\textbf{revise} the draft answer to generate the final answer.\n Our proposal combines the advantages of the efficiency of adapting a smaller\n7B model with the evidence-assessing capability of GPT-4 and effectively\nprevents GPT-4 from generating hallucinatory content. In the zero-shot setting\nof four Chinese legal tasks, our method improves accuracy by 33.3\\% compared to\nthe direct generation by GPT-4. When compared to two stronger retrieval-based\nbaselines, our method outperforms them by 15.4\\% and 23.9\\%. Our code will be\nreleased\n","authors":["Zhen wan","Yating Zhang","Yexiang Wang","Fei Cheng","Sadao Kurohashi"],"pdf_url":"https://arxiv.org/pdf/2310.03328v3.pdf","comment":"Accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2407.07275v2","updated":"2024-08-26T00:55:01Z","published":"2024-07-09T23:39:37Z","title":"Remastering Divide and Remaster: A Cinematic Audio Source Separation\n Dataset with Multilingual Support","summary":" Cinematic audio source separation (CASS), as a problem of extracting the\ndialogue, music, and effects stems from their mixture, is a relatively new\nsubtask of audio source separation. To date, only one publicly available\ndataset exists for CASS, that is, the Divide and Remaster (DnR) dataset, which\nis currently at version 2. While DnR v2 has been an incredibly useful resource\nfor CASS, several areas of improvement have been identified, particularly\nthrough its use in the 2023 Sound Demixing Challenge. In this work, we develop\nversion 3 of the DnR dataset, addressing issues relating to vocal content in\nnon-dialogue stems, loudness distributions, mastering process, and linguistic\ndiversity. In particular, the dialogue stem of DnR v3 includes speech content\nfrom more than 30 languages from multiple families including but not limited to\nthe Germanic, Romance, Indo-Aryan, Dravidian, Malayo-Polynesian, and Bantu\nfamilies. Benchmark results using the Bandit model indicated that training on\nmultilingual data yields significant generalizability to the model even in\nlanguages with low data availability. Even in languages with high data\navailability, the multilingual model often performs on par or better than\ndedicated models trained on monolingual CASS datasets. Dataset and model\nimplementation will be made available at\nhttps://github.com/kwatcharasupat/source-separation-landing.\n","authors":["Karn N. Watcharasupat","Chih-Wei Wu","Iroro Orife"],"pdf_url":"https://arxiv.org/pdf/2407.07275v2.pdf","comment":"Accepted to the 5th IEEE International Symposium on the Internet of\n Sounds. 
Camera-ready version"},{"id":"http://arxiv.org/abs/2408.13966v1","updated":"2024-08-26T00:23:56Z","published":"2024-08-26T00:23:56Z","title":"Reducing the Cost: Cross-Prompt Pre-Finetuning for Short Answer Scoring","summary":" Automated Short Answer Scoring (SAS) is the task of automatically scoring a\ngiven input to a prompt based on rubrics and reference answers. Although SAS is\nuseful in real-world applications, both rubrics and reference answers differ\nbetween prompts, thus requiring a need to acquire new data and train a model\nfor each new prompt. Such requirements are costly, especially for schools and\nonline courses where resources are limited and only a few prompts are used. In\nthis work, we attempt to reduce this cost through a two-phase approach: train a\nmodel on existing rubrics and answers with gold score signals and finetune it\non a new prompt. Specifically, given that scoring rubrics and reference answers\ndiffer for each prompt, we utilize key phrases, or representative expressions\nthat the answer should contain to increase scores, and train a SAS model to\nlearn the relationship between key phrases and answers using already annotated\nprompts (i.e., cross-prompts). Our experimental results show that finetuning on\nexisting cross-prompt data with key phrases significantly improves scoring\naccuracy, especially when the training data is limited. Finally, our extensive\nanalysis shows that it is crucial to design the model so that it can learn the\ntask's general property.\n","authors":["Hiroaki Funayama","Yuya Asazuma","Yuichiroh Matsubayashi","Tomoya Mizumoto","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2408.13966v1.pdf","comment":"This is the draft submitted to AIED 2023. For the latest version,\n please visit: https://link.springer.com/chapter/10.1007/978-3-031-36272-9_7"},{"id":"http://arxiv.org/abs/2408.14698v1","updated":"2024-08-26T23:52:27Z","published":"2024-08-26T23:52:27Z","title":"Smart Multi-Modal Search: Contextual Sparse and Dense Embedding\n Integration in Adobe Express","summary":" As user content and queries become increasingly multi-modal, the need for\neffective multi-modal search systems has grown. Traditional search systems\noften rely on textual and metadata annotations for indexed images, while\nmulti-modal embeddings like CLIP enable direct search using text and image\nembeddings. However, embedding-based approaches face challenges in integrating\ncontextual features such as user locale and recency. Building a scalable\nmulti-modal search system requires fine-tuning several components. This paper\npresents a multi-modal search architecture and a series of AB tests that\noptimize embeddings and multi-modal technologies in Adobe Express template\nsearch. We address considerations such as embedding model selection, the roles\nof embeddings in matching and ranking, and the balance between dense and sparse\nembeddings. Our iterative approach demonstrates how utilizing sparse, dense,\nand contextual features enhances short and long query search, significantly\nreduces null rates (over 70\\%), and increases click-through rates (CTR). 
Our\nfindings provide insights into developing robust multi-modal search systems,\nthereby enhancing relevance for complex queries.\n","authors":["Cherag Aroraa","Tracy Holloway King","Jayant Kumar","Yi Lu","Sanat Sharma","Arvind Srikantan","David Uvalle","Josep Valls-Vargas","Harsha Vardhan"],"pdf_url":"https://arxiv.org/pdf/2408.14698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14690v1","updated":"2024-08-26T23:30:15Z","published":"2024-08-26T23:30:15Z","title":"Training-Free Activation Sparsity in Large Language Models","summary":" Activation sparsity can enable practical inference speedups in large language\nmodels (LLMs) by reducing the compute and memory-movement required for matrix\nmultiplications during the forward pass. However, existing methods face\nlimitations that inhibit widespread adoption. Some approaches are tailored\ntowards older models with ReLU-based sparsity, while others require extensive\ncontinued pre-training on up to hundreds of billions of tokens. This paper\ndescribes TEAL, a simple training-free method that applies magnitude-based\nactivation sparsity to hidden states throughout the entire model. TEAL achieves\n40-50% model-wide sparsity with minimal performance degradation across Llama-2,\nLlama-3, and Mistral families, with sizes varying from 7B to 70B. We improve\nexisting sparse kernels and demonstrate wall-clock decoding speed-ups of up to\n1.53$\\times$ and 1.8$\\times$ at 40% and 50% model-wide sparsity. TEAL is\ncompatible with weight quantization, enabling further efficiency gains.\n","authors":["James Liu","Pragaash Ponnusamy","Tianle Cai","Han Guo","Yoon Kim","Ben Athiwaratkun"],"pdf_url":"https://arxiv.org/pdf/2408.14690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06264v2","updated":"2024-08-26T22:53:51Z","published":"2024-02-09T09:25:18Z","title":"LLaVA-Docent: Instruction Tuning with Multimodal Large Language Model to\n Support Art Appreciation Education","summary":" Art appreciation is vital in nurturing critical thinking and emotional\nintelligence among learners. However, traditional art appreciation education\nhas often been hindered by limited access to art resources, especially for\ndisadvantaged students, and an imbalanced emphasis on STEM subjects in\nmainstream education. In response to these challenges, recent technological\nadvancements have paved the way for innovative solutions. This study explores\nthe application of multi-modal large language models (MLLMs) in art\nappreciation education, focusing on developing LLaVA-Docent, a model that\nleverages these advancements. Our approach involved a comprehensive literature\nreview and consultations with experts in the field, leading to developing a\nrobust data framework. Utilizing this framework, we generated a virtual\ndialogue dataset that was leveraged by GPT-4. This dataset was instrumental in\ntraining the MLLM, named LLaVA-Docent. Six researchers conducted quantitative\nand qualitative evaluations of LLaVA-Docent to assess its effectiveness,\nbenchmarking it against the GPT-4 model in a few-shot setting. The evaluation\nprocess revealed distinct strengths and weaknesses of the LLaVA-Docent model.\nOur findings highlight the efficacy of LLaVA-Docent in enhancing the\naccessibility and engagement of art appreciation education. 
By harnessing the\npotential of MLLMs, this study makes a significant contribution to the field of\nart education, proposing a novel methodology that reimagines the way art\nappreciation is taught and experienced.\n","authors":["Unggi Lee","Minji Jeon","Yunseo Lee","Gyuri Byun","Yoorim Son","Jaeyoon Shin","Hongkyu Ko","Hyeoncheol Kim"],"pdf_url":"https://arxiv.org/pdf/2402.06264v2.pdf","comment":"37 pages, 4 figures, 10 tables"},{"id":"http://arxiv.org/abs/2406.10774v2","updated":"2024-08-26T21:01:02Z","published":"2024-06-16T01:33:02Z","title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference","summary":" As the demand for long-context large language models (LLMs) increases, models\nwith context windows of up to 128K or 1M tokens are becoming increasingly\nprevalent. However, long-context LLM inference is challenging since the\ninference speed decreases significantly as the sequence length grows. This\nslowdown is primarily caused by loading a large KV cache during self-attention.\nPrevious works have shown that a small portion of critical tokens will dominate\nthe attention outcomes. However, we observe the criticality of a token highly\ndepends on the query. To this end, we propose Quest, a query-aware KV cache\nselection algorithm. Quest keeps track of the minimal and maximal Key values in\nKV cache pages and estimates the criticality of a given page using Query\nvectors. By only loading the Top-K critical KV cache pages for attention, Quest\nsignificantly speeds up self-attention without sacrificing accuracy. We show\nthat Quest can achieve up to 2.23x self-attention speedup, which reduces\ninference latency by 7.03x while performing well on tasks with long\ndependencies with negligible accuracy loss. Code is available at\nhttp://github.com/mit-han-lab/Quest .\n","authors":["Jiaming Tang","Yilong Zhao","Kan Zhu","Guangxuan Xiao","Baris Kasikci","Song Han"],"pdf_url":"https://arxiv.org/pdf/2406.10774v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.14636v1","updated":"2024-08-26T21:00:25Z","published":"2024-08-26T21:00:25Z","title":"Relationships are Complicated! An Analysis of Relationships Between\n Datasets on the Web","summary":" The Web today has millions of datasets, and the number of datasets continues\nto grow at a rapid pace. These datasets are not standalone entities; rather,\nthey are intricately connected through complex relationships. Semantic\nrelationships between datasets provide critical insights for research and\ndecision-making processes. In this paper, we study dataset relationships from\nthe perspective of users who discover, use, and share datasets on the Web: what\nrelationships are important for different tasks? What contextual information\nmight users want to know? We first present a comprehensive taxonomy of\nrelationships between datasets on the Web and map these relationships to user\ntasks performed during dataset discovery. We develop a series of methods to\nidentify these relationships and compare their performance on a large corpus of\ndatasets generated from Web pages with schema.org markup. We demonstrate that\nmachine-learning based methods that use dataset metadata achieve multi-class\nclassification accuracy of 90%. Finally, we highlight gaps in available\nsemantic markup for datasets and discuss how incorporating comprehensive\nsemantics can facilitate the identification of dataset relationships. 
By\nproviding a comprehensive overview of dataset relationships at scale, this\npaper sets a benchmark for future research.\n","authors":["Kate Lin","Tarfah Alrashed","Natasha Noy"],"pdf_url":"https://arxiv.org/pdf/2408.14636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03605v2","updated":"2024-08-26T20:48:19Z","published":"2024-04-04T17:25:30Z","title":"Mitigating the Impact of Outlier Channels for Language Model\n Quantization with Activation Regularization","summary":" We consider the problem of accurate quantization for language models, where\nboth the weights and activations are uniformly quantized to 4 bits per\nparameter, the lowest bitwidth format natively supported by GPU hardware. In\nthis context, the key challenge is activation quantization: it is known that\nlanguage models contain outlier channels whose values on average are orders of\nmagnitude higher than than other channels, which prevents accurate low-bitwidth\nquantization with known techniques. We systematically study this phenomena and\nfind that these outlier channels emerge early in training, and that they occur\nmore frequently in layers with residual streams. We then propose a simple\nstrategy which regularizes a layer's inputs via quantization-aware training\n(QAT) and its outputs via activation kurtosis regularization. We show that\nregularizing both the inputs and outputs is crucial for preventing a model's\n\"migrating\" the difficulty in input quantization to the weights, which makes\npost-training quantization (PTQ) of weights more difficult. When combined with\nweight PTQ, we show that our approach can obtain a W4A4 model that performs\ncompetitively to the standard-precision W16A16 baseline.\n","authors":["Aniruddha Nrusimha","Mayank Mishra","Naigang Wang","Dan Alistarh","Rameswar Panda","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2404.03605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14623v1","updated":"2024-08-26T20:36:52Z","published":"2024-08-26T20:36:52Z","title":"MODOC: A Modular Interface for Flexible Interlinking of Text Retrieval\n and Text Generation Functions","summary":" Large Language Models (LLMs) produce eloquent texts but often the content\nthey generate needs to be verified. Traditional information retrieval systems\ncan assist with this task, but most systems have not been designed with\nLLM-generated queries in mind. As such, there is a compelling need for\nintegrated systems that provide both retrieval and generation functionality\nwithin a single user interface.\n We present MODOC, a modular user interface that leverages the capabilities of\nLLMs and provides assistance with detecting their confabulations, promoting\nintegrity in scientific writing. MODOC represents a significant step forward in\nscientific writing assistance. Its modular architecture supports flexible\nfunctions for retrieving information and for writing and generating text in a\nsingle, user-friendly interface.\n","authors":["Yingqiang Gao","Jhony Prada","Nianlong Gu","Jessica Lam","Richard H. R. Hahnloser"],"pdf_url":"https://arxiv.org/pdf/2408.14623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14622v1","updated":"2024-08-26T20:35:42Z","published":"2024-08-26T20:35:42Z","title":"What Makes a Good Story and How Can We Measure It? A Comprehensive\n Survey of Story Evaluation","summary":" With the development of artificial intelligence, particularly the success of\nLarge Language Models (LLMs), the quantity and quality of automatically\ngenerated stories have significantly increased. 
This has led to the need for\nautomatic story evaluation to assess the generative capabilities of computing\nsystems and analyze the quality of both automatically generated and human-written\nstories. Evaluating a story can be more challenging than other generation\nevaluation tasks. While tasks like machine translation primarily focus on\nassessing the aspects of fluency and accuracy, story evaluation demands complex\nadditional measures such as overall coherence, character development,\ninterestingness, etc. This requires a thorough review of relevant research. In\nthis survey, we first summarize existing storytelling tasks, including\ntext-to-text, visual-to-text, and text-to-visual. We highlight their evaluation\nchallenges, identify various human criteria to measure stories, and present\nexisting benchmark datasets. Then, we propose a taxonomy to organize evaluation\nmetrics that have been developed or can be adopted for story evaluation. We\nalso provide descriptions of these metrics, along with the discussion of their\nmerits and limitations. Later, we discuss the human-AI collaboration for story\nevaluation and generation. Finally, we suggest potential future research\ndirections, extending from story evaluation to general evaluations.\n","authors":["Dingyi Yang","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2408.14622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01663v3","updated":"2024-08-26T20:30:40Z","published":"2024-04-02T06:07:35Z","title":"CMAT: A Multi-Agent Collaboration Tuning Framework for Enhancing Small\n Language Models","summary":" Open large language models (LLMs) have significantly advanced the field of\nnatural language processing, showcasing impressive performance across various\ntasks. Despite the significant advancements in LLMs, their effective operation\nstill relies heavily on human input to accurately guide the dialogue flow, with\nagent tuning being a crucial optimization technique that involves human\nadjustments to the model for better response to such guidance. Addressing this\ndependency, our work introduces the TinyAgent model, trained on a meticulously\ncurated high-quality dataset. We also present the Collaborative Multi-Agent\nTuning (CMAT) framework, an innovative system designed to augment language\nagent capabilities through adaptive weight updates based on environmental\nfeedback. This framework fosters collaborative learning and real-time\nadaptation among multiple intelligent agents, enhancing their context-awareness\nand long-term memory. In this research, we propose a new communication agent\nframework that integrates multi-agent systems with environmental feedback\nmechanisms, offering a scalable method to explore cooperative behaviors.\nNotably, our TinyAgent-7B model exhibits performance on par with GPT-3.5,\ndespite having fewer parameters, signifying a substantial improvement in the\nefficiency and effectiveness of LLMs.\n","authors":["Xuechen Liang","Meiling Tao","Yinghui Xia","Tianyu Shi","Jun Wang","JingSong Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01663v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06494v2","updated":"2024-08-26T20:10:52Z","published":"2024-08-12T21:04:16Z","title":"What Color Scheme is More Effective in Assisting Readers to Locate\n Information in a Color-Coded Article?","summary":" Color coding, a technique assigning specific colors to cluster information\ntypes, has proven advantages in aiding human cognitive activities, especially\nreading and comprehension. 
The rise of Large Language Models (LLMs) has\nstreamlined document coding, enabling simple automatic text labeling with\nvarious schemes. This has the potential to make color-coding more accessible\nand benefit more users. However, the impact of color choice on information\nseeking is understudied. We conducted a user study assessing various color\nschemes' effectiveness in LLM-coded text documents, standardizing contrast\nratios to approximately 5.55:1 across schemes. Participants performed timed\ninformation-seeking tasks in color-coded scholarly abstracts. Results showed\nnon-analogous and yellow-inclusive color schemes improved performance, with the\nlatter also being more preferred by participants. These findings can inform\nbetter color scheme choices for text annotation. As LLMs advance document\ncoding, we advocate for more research focusing on the \"color\" aspect of\ncolor-coding techniques.\n","authors":["Ho Yin Ng","Zeyu He","Ting-Hao 'Kenneth' Huang"],"pdf_url":"https://arxiv.org/pdf/2408.06494v2.pdf","comment":"This paper will appear at IEEE VIS 2024"},{"id":"http://arxiv.org/abs/2405.11083v2","updated":"2024-08-26T20:02:16Z","published":"2024-05-17T20:30:49Z","title":"Prompt Exploration with Prompt Regression","summary":" In the advent of democratized usage of large language models (LLMs), there is\na growing desire to systematize LLM prompt creation and selection processes\nbeyond iterative trial-and-error. Prior works majorly focus on searching the\nspace of prompts without accounting for relations between prompt variations.\nHere we propose a framework, Prompt Exploration with Prompt Regression (PEPR),\nto predict the effect of prompt combinations given results for individual\nprompt elements as well as a simple method to select an effective prompt for a\ngiven use-case. We evaluate our approach with open-source LLMs of different\nsizes on several different tasks.\n","authors":["Michael Feffer","Ronald Xu","Yuekai Sun","Mikhail Yurochkin"],"pdf_url":"https://arxiv.org/pdf/2405.11083v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2406.06484v2","updated":"2024-08-26T19:50:37Z","published":"2024-06-10T17:24:42Z","title":"Parallelizing Linear Transformers with the Delta Rule over Sequence\n Length","summary":" Transformers with linear attention (i.e., linear transformers) and\nstate-space models have recently been suggested as a viable linear-time\nalternative to transformers with softmax attention. However, these models still\nunderperform transformers especially on tasks that require in-context\nretrieval. While more expressive variants of linear transformers which replace\nthe additive outer-product update in linear transformers with the delta rule\nhave been found to be more effective at associative recall, existing algorithms\nfor training such models do not parallelize over sequence length and are thus\ninefficient to train on modern hardware. This work describes a\nhardware-efficient algorithm for training linear transformers with the delta\nrule, which exploits a memory-efficient representation for computing products\nof Householder matrices. This algorithm allows us to scale up DeltaNet to\nstandard language modeling settings. We train a 1.3B model for 100B tokens and\nfind that it outperforms recent linear-time baselines such as Mamba and GLA in\nterms of perplexity and zero-shot performance on downstream tasks (including on\ntasks that focus on recall). 
We also experiment with two hybrid models which\ncombine DeltaNet layers with (1) sliding-window attention layers every other\nlayer or (2) two global attention layers, and find that these hybrid models\noutperform strong transformer baselines.\n","authors":["Songlin Yang","Bailin Wang","Yu Zhang","Yikang Shen","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2406.06484v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.14595v1","updated":"2024-08-26T19:26:55Z","published":"2024-08-26T19:26:55Z","title":"Surprisingly Fragile: Assessing and Addressing Prompt Instability in\n Multimodal Foundation Models","summary":" Multimodal foundation models (MFMs) such as OFASys show the potential to\nunlock analysis of complex data such as images, videos, and audio data via text\nprompts alone. However, their performance may suffer in the face of text input\nthat differs even slightly from their training distribution, which is\nsurprising considering the use of modality-specific data to \"ground\" the text\ninput. This study demonstrates that prompt instability is a major concern for\nMFMs, leading to a consistent drop in performance across all modalities, but\nthat instability can be mitigated with additional training with augmented data.\nWe evaluate several methods for grounded prompt perturbation, where we generate\nperturbations and filter based on similarity to text and/or modality data.\nAfter re-training the models on the augmented data, we find improved accuracy\nand more stable performance on the perturbed test data regardless of\nperturbation condition, suggesting that the data augmentation strategy helps\nthe models handle domain shifts more effectively. In error analysis, we find\nconsistent patterns of performance improvement across domains, suggesting that\nretraining on prompt perturbations tends to help general reasoning capabilities\nin MFMs.\n","authors":["Ian Stewart","Sameera Horawalavithana","Brendan Kennedy","Sai Munikoti","Karl Pazdernik"],"pdf_url":"https://arxiv.org/pdf/2408.14595v1.pdf","comment":"in submission"},{"id":"http://arxiv.org/abs/2402.17700v2","updated":"2024-08-26T19:26:06Z","published":"2024-02-27T17:25:37Z","title":"RAVEL: Evaluating Interpretability Methods on Disentangling Language\n Model Representations","summary":" Individual neurons participate in the representation of multiple high-level\nconcepts. To what extent can different interpretability methods successfully\ndisentangle these roles? To help address this question, we introduce RAVEL\n(Resolving Attribute-Value Entanglements in Language Models), a dataset that\nenables tightly controlled, quantitative comparisons between a variety of\nexisting interpretability methods. We use the resulting conceptual framework to\ndefine the new method of Multi-task Distributed Alignment Search (MDAS), which\nallows us to find distributed representations satisfying multiple causal\ncriteria. With Llama2-7B as the target language model, MDAS achieves\nstate-of-the-art results on RAVEL, demonstrating the importance of going beyond\nneuron-level analyses to identify features distributed across activations. 
We\nrelease our benchmark at https://github.com/explanare/ravel.\n","authors":["Jing Huang","Zhengxuan Wu","Christopher Potts","Mor Geva","Atticus Geiger"],"pdf_url":"https://arxiv.org/pdf/2402.17700v2.pdf","comment":"Proceedings of the 62nd Annual Meeting of the Association for\n Computational Linguistics (ACL 2024)"},{"id":"http://arxiv.org/abs/2408.14572v1","updated":"2024-08-26T18:42:59Z","published":"2024-08-26T18:42:59Z","title":"CURLoRA: Stable LLM Continual Fine-Tuning and Catastrophic Forgetting\n Mitigation","summary":" This paper introduces CURLoRA, a novel approach to fine-tuning large language\nmodels (LLMs) that leverages CUR matrix decomposition in the context of\nLow-Rank Adaptation (LoRA). Our method addresses two critical challenges in LLM\nfine-tuning: mitigating catastrophic forgetting during continual learning and\nreducing the number of trainable parameters. We propose a unique modification\nto the CUR decomposition process, utilizing inverted probabilities for column\nand row selection which acts as an implicit regularization, and initializing\nthe $U$ matrix as a zero matrix, and only fine-tuning it. We demonstrate\nthrough experiments on multiple datasets that CURLoRA outperforms standard LoRA\nin mitigating catastrophic forgetting. It maintains model stability and\nperformance across tasks while significantly reducing the number of trainable\nparameters. Our results show that CURLoRA achieves very good and stable task\naccuracy while maintaining base model's perplexity scores fixed compared to\nLoRA upon continual fine-tuning, particularly in scenarios with limited data.\n","authors":["Muhammad Fawi"],"pdf_url":"https://arxiv.org/pdf/2408.14572v1.pdf","comment":"Code available at https://github.com/MNoorFawi/curlora"},{"id":"http://arxiv.org/abs/2408.14568v1","updated":"2024-08-26T18:39:31Z","published":"2024-08-26T18:39:31Z","title":"Improving Clinical Note Generation from Complex Doctor-Patient\n Conversation","summary":" Writing clinical notes and documenting medical exams is a critical task for\nhealthcare professionals, serving as a vital component of patient care\ndocumentation. However, manually writing these notes is time-consuming and can\nimpact the amount of time clinicians can spend on direct patient interaction\nand other tasks. Consequently, the development of automated clinical note\ngeneration systems has emerged as a clinically meaningful area of research\nwithin AI for health. In this paper, we present three key contributions to the\nfield of clinical note generation using large language models (LLMs). First, we\nintroduce CliniKnote, a comprehensive dataset consisting of 1,200 complex\ndoctor-patient conversations paired with their full clinical notes. This\ndataset, created and curated by medical experts with the help of modern neural\nnetworks, provides a valuable resource for training and evaluating models in\nclinical note generation tasks. Second, we propose the K-SOAP (Keyword,\nSubjective, Objective, Assessment, and Plan) note format, which enhances\ntraditional SOAP~\\cite{podder2023soap} (Subjective, Objective, Assessment, and\nPlan) notes by adding a keyword section at the top, allowing for quick\nidentification of essential information. Third, we develop an automatic\npipeline to generate K-SOAP notes from doctor-patient conversations and\nbenchmark various modern LLMs using various metrics. 
Our results demonstrate\nsignificant improvements in efficiency and performance compared to standard LLM\nfinetuning methods.\n","authors":["Yizhan Li","Sifan Wu","Christopher Smith","Thomas Lo","Bang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14547v1","updated":"2024-08-26T18:00:33Z","published":"2024-08-26T18:00:33Z","title":"Revisiting Image Captioning Training Paradigm via Direct CLIP-based\n Optimization","summary":" The conventional training approach for image captioning involves pre-training\na network using teacher forcing and subsequent fine-tuning with Self-Critical\nSequence Training to maximize hand-crafted captioning metrics. However, when\nattempting to optimize modern and higher-quality metrics like CLIP-Score and\nPAC-Score, this training method often encounters instability and fails to\nacquire the genuine descriptive capabilities needed to produce fluent and\ninformative captions. In this paper, we propose a new training paradigm termed\nDirect CLIP-Based Optimization (DiCO). Our approach jointly learns and\noptimizes a reward model that is distilled from a learnable captioning\nevaluator with high human correlation. This is done by solving a weighted\nclassification problem directly inside the captioner. At the same time, DiCO\nprevents divergence from the original model, ensuring that fluency is\nmaintained. DiCO not only exhibits improved stability and enhanced quality in\nthe generated captions but also aligns more closely with human preferences\ncompared to existing methods, especially in modern metrics. Additionally, it\nmaintains competitive performance in traditional metrics. Our source code and\ntrained models are publicly available at https://github.com/aimagelab/DiCO.\n","authors":["Nicholas Moratelli","Davide Caffagni","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2408.14547v1.pdf","comment":"BMVC 2024"},{"id":"http://arxiv.org/abs/2012.12311v4","updated":"2024-08-26T15:34:13Z","published":"2020-12-22T19:32:52Z","title":"Unboxing Engagement in YouTube Influencer Videos: An Attention-Based\n Approach","summary":" Influencer marketing videos have surged in popularity, yet significant gaps\nremain in understanding the relationship between video features and engagement.\nThis challenge is intensified by the complexities of interpreting unstructured\ndata. While deep learning models effectively leverage unstructured data to\npredict business outcomes, they often function as black boxes with limited\ninterpretability, particularly when human validation is hindered by the absence\nof a known ground truth. To address this issue, the authors develop an\n\"interpretable deep learning framework\" that not only makes good out-of-sample\npredictions using unstructured data but also provides insights into the\ncaptured relationships. Inspired by visual attention in print advertising, the\ninterpretation approach uses measures of model attention to video features,\neliminating spurious associations through a two-step process and shortlisting\nrelationships for formal causal testing. This method is applicable across\nwell-known attention mechanisms - additive attention, scaled dot-product\nattention, and gradient-based attention - when analyzing text, audio, or video\nimage data. Validated using simulations, this approach outperforms benchmark\nfeature selection methods. 
This framework is applied to YouTube influencer\nvideos, linking video features to measures of shallow and deep engagement\ndeveloped based on the dual-system framework of thinking. The findings guide\ninfluencers and brands in prioritizing video features associated with deep\nengagement.\n","authors":["Prashant Rajaram","Puneet Manchanda"],"pdf_url":"https://arxiv.org/pdf/2012.12311v4.pdf","comment":"50 pages, Online Appendix"},{"id":"http://arxiv.org/abs/2408.13985v1","updated":"2024-08-26T02:35:37Z","published":"2024-08-26T02:35:37Z","title":"TF-Attack: Transferable and Fast Adversarial Attacks on Large Language\n Models","summary":" With the great advancements in large language models (LLMs), adversarial\nattacks against LLMs have recently attracted increasing attention. We found\nthat pre-existing adversarial attack methodologies exhibit limited\ntransferability and are notably inefficient, particularly when applied to LLMs.\nIn this paper, we analyze the core mechanisms of previous predominant\nadversarial attack methods, revealing that 1) the distributions of importance\nscore differ markedly among victim models, restricting the transferability; 2)\nthe sequential attack processes induces substantial time overheads. Based on\nthe above two insights, we introduce a new scheme, named TF-Attack, for\nTransferable and Fast adversarial attacks on LLMs. TF-Attack employs an\nexternal LLM as a third-party overseer rather than the victim model to identify\ncritical units within sentences. Moreover, TF-Attack introduces the concept of\nImportance Level, which allows for parallel substitutions of attacks. We\nconduct extensive experiments on 6 widely adopted benchmarks, evaluating the\nproposed method through both automatic and human metrics. Results show that our\nmethod consistently surpasses previous methods in transferability and delivers\nsignificant speed improvements, up to 20 times faster than earlier attack\nstrategies.\n","authors":["Zelin Li","Kehai Chen","Xuefeng Bai","Lemao Liu","Mingming Yang","Yang Xiang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13985v1.pdf","comment":"14 pages, 6 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.14471v1","updated":"2024-08-26T17:59:01Z","published":"2024-08-26T17:59:01Z","title":"A Practitioner's Guide to Continual Multimodal Pretraining","summary":" Multimodal foundation models serve numerous applications at the intersection\nof vision and language. Still, despite being pretrained on extensive data, they\nbecome outdated over time. To keep models updated, research into continual\npretraining mainly explores scenarios with either (1) infrequent,\nindiscriminate updates on large-scale new data, or (2) frequent, sample-level\nupdates. However, practical model deployment often operates in the gap between\nthese two limit cases, as real-world applications often demand adaptation to\nspecific subdomains, tasks or concepts -- spread over the entire, varying life\ncycle of a model. In this work, we complement current perspectives on continual\npretraining through a research test bed as well as provide comprehensive\nguidance for effective continual model updates in such scenarios. We first\nintroduce FoMo-in-Flux, a continual multimodal pretraining benchmark with\nrealistic compute constraints and practical deployment requirements,\nconstructed over 63 datasets with diverse visual and semantic coverage. 
Using\nFoMo-in-Flux, we explore the complex landscape of practical continual\npretraining through multiple perspectives: (1) A data-centric investigation of\ndata mixtures and stream orderings that emulate real-world deployment\nsituations, (2) a method-centric investigation ranging from simple fine-tuning\nand traditional continual learning strategies to parameter-efficient updates\nand model merging, (3) meta learning rate schedules and mechanistic design\nchoices, and (4) the influence of model and compute scaling. Together, our\ninsights provide a practitioner's guide to continual multimodal pretraining for\nreal-world deployment. Our benchmark and code is here:\nhttps://github.com/ExplainableML/fomo_in_flux.\n","authors":["Karsten Roth","Vishaal Udandarao","Sebastian Dziadzio","Ameya Prabhu","Mehdi Cherti","Oriol Vinyals","Olivier Hénaff","Samuel Albanie","Matthias Bethge","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2408.14471v1.pdf","comment":"Technical Report. 52 pages"},{"id":"http://arxiv.org/abs/2408.14469v1","updated":"2024-08-26T17:58:47Z","published":"2024-08-26T17:58:47Z","title":"Grounded Multi-Hop VideoQA in Long-Form Egocentric Videos","summary":" This paper considers the problem of Multi-Hop Video Question Answering\n(MH-VidQA) in long-form egocentric videos. This task not only requires to\nanswer visual questions, but also to localize multiple relevant time intervals\nwithin the video as visual evidences. We develop an automated pipeline to\ncreate multi-hop question-answering pairs with associated temporal evidence,\nenabling to construct a large-scale dataset for instruction-tuning. To monitor\nthe progress of this new task, we further curate a high-quality benchmark,\nMultiHop-EgoQA, with careful manual verification and refinement. Experimental\nresults reveal that existing multi-modal systems exhibit inadequate multi-hop\ngrounding and reasoning abilities, resulting in unsatisfactory performance. We\nthen propose a novel architecture, termed as Grounding Scattered Evidence with\nLarge Language Model (GeLM), that enhances multi-modal large language models\n(MLLMs) by incorporating a grounding module to retrieve temporal evidence from\nvideos using flexible grounding tokens. Trained on our visual instruction data,\nGeLM demonstrates improved multi-hop grounding and reasoning capabilities,\nsetting a new baseline for this challenging task. Furthermore, when trained on\nthird-person view videos, the same architecture also achieves state-of-the-art\nperformance on the single-hop VidQA benchmark, ActivityNet-RTL, demonstrating\nits effectiveness.\n","authors":["Qirui Chen","Shangzhe Di","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2408.14469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14457v1","updated":"2024-08-26T17:49:27Z","published":"2024-08-26T17:49:27Z","title":"Dense Center-Direction Regression for Object Counting and Localization\n with Point Supervision","summary":" Object counting and localization problems are commonly addressed with point\nsupervised learning, which allows the use of less labor-intensive point\nannotations. However, learning based on point annotations poses challenges due\nto the high imbalance between the sets of annotated and unannotated pixels,\nwhich is often treated with Gaussian smoothing of point annotations and focal\nloss. However, these approaches still focus on the pixels in the immediate\nvicinity of the point annotations and exploit the rest of the data only\nindirectly. 
In this work, we propose a novel approach termed CeDiRNet for\npoint-supervised learning that uses a dense regression of directions pointing\ntowards the nearest object centers, i.e. center-directions. This provides\ngreater support for each center point arising from many surrounding pixels\npointing towards the object center. We propose a formulation of\ncenter-directions that allows the problem to be split into the domain-specific\ndense regression of center-directions and the final localization task based on\na small, lightweight, and domain-agnostic localization network that can be\ntrained with synthetic data completely independent of the target domain. We\ndemonstrate the performance of the proposed method on six different datasets\nfor object counting and localization, and show that it outperforms the existing\nstate-of-the-art methods. The code is accessible on GitHub at\nhttps://github.com/vicoslab/CeDiRNet.git.\n","authors":["Domen Tabernik","Jon Muhovič","Danijel Skočaj"],"pdf_url":"https://arxiv.org/pdf/2408.14457v1.pdf","comment":"Published in Pattern Recognition"},{"id":"http://arxiv.org/abs/2408.14456v1","updated":"2024-08-26T17:49:05Z","published":"2024-08-26T17:49:05Z","title":"Center Direction Network for Grasping Point Localization on Cloths","summary":" Object grasping is a fundamental challenge in robotics and computer vision,\ncritical for advancing robotic manipulation capabilities. Deformable objects,\nlike fabrics and cloths, pose additional challenges due to their non-rigid\nnature. In this work, we introduce CeDiRNet-3DoF, a deep-learning model for\ngrasp point detection, with a particular focus on cloth objects. CeDiRNet-3DoF\nemploys center direction regression alongside a localization network, attaining\nfirst place in the perception task of ICRA 2023's Cloth Manipulation Challenge.\nRecognizing the lack of standardized benchmarks in the literature that hinder\neffective method comparison, we present the ViCoS Towel Dataset. This extensive\nbenchmark dataset comprises 8,000 real and 12,000 synthetic images, serving as\na robust resource for training and evaluating contemporary data-driven\ndeep-learning approaches. Extensive evaluation revealed CeDiRNet-3DoF's\nrobustness in real-world performance, outperforming state-of-the-art methods,\nincluding the latest transformer-based models. Our work bridges a crucial gap,\noffering a robust solution and benchmark for cloth grasping in computer vision\nand robotics. Code and dataset are available at:\nhttps://github.com/vicoslab/CeDiRNet-3DoF\n","authors":["Domen Tabernik","Jon Muhovič","Matej Urbas","Danijel Skočaj"],"pdf_url":"https://arxiv.org/pdf/2408.14456v1.pdf","comment":"Accepted for publication in IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2408.14442v1","updated":"2024-08-26T17:35:01Z","published":"2024-08-26T17:35:01Z","title":"Model Parallel Training and Transfer Learning for Convolutional Neural\n Networks by Domain Decomposition","summary":" Deep convolutional neural networks (CNNs) have been shown to be very\nsuccessful in a wide range of image processing applications. However, due to\ntheir increasing number of model parameters and an increasing availability of\nlarge amounts of training data, parallelization strategies to efficiently train\ncomplex CNNs are necessary. In previous work by the authors, a novel model\nparallel CNN architecture was proposed which is loosely inspired by domain\ndecomposition. 
In particular, the novel network architecture is based on a\ndecomposition of the input data into smaller subimages. For each of these\nsubimages, local CNNs with a proportionally smaller number of parameters are\ntrained in parallel and the resulting local classifications are then aggregated\nin a second step by a dense feedforward neural network (DNN). In the present\nwork, we compare the resulting CNN-DNN architecture to less costly alternatives\nto combine the local classifications into a final, global decision.\nAdditionally, we investigate the performance of the CNN-DNN trained as one\ncoherent model as well as using a transfer learning strategy, where the\nparameters of the pre-trained local CNNs are used as initial values for a\nsubsequently trained global coherent CNN-DNN model.\n","authors":["Axel Klawonn","Martin Lanser","Janine Weber"],"pdf_url":"https://arxiv.org/pdf/2408.14442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14441v1","updated":"2024-08-26T17:33:47Z","published":"2024-08-26T17:33:47Z","title":"Attend-Fusion: Efficient Audio-Visual Fusion for Video Classification","summary":" Exploiting both audio and visual modalities for video classification is a\nchallenging task, as the existing methods require large model architectures,\nleading to high computational complexity and resource requirements. Smaller\narchitectures, on the other hand, struggle to achieve optimal performance. In\nthis paper, we propose Attend-Fusion, an audio-visual (AV) fusion approach that\nintroduces a compact model architecture specifically designed to capture\nintricate audio-visual relationships in video data. Through extensive\nexperiments on the challenging YouTube-8M dataset, we demonstrate that\nAttend-Fusion achieves an F1 score of 75.64\\% with only 72M parameters, which\nis comparable to the performance of larger baseline models such as\nFully-Connected Late Fusion (75.96\\% F1 score, 341M parameters). Attend-Fusion\nachieves similar performance to the larger baseline model while reducing the\nmodel size by nearly 80\\%, highlighting its efficiency in terms of model\ncomplexity. Our work demonstrates that the Attend-Fusion model effectively\ncombines audio and visual information for video classification, achieving\ncompetitive performance with significantly reduced model size. This approach\nopens new possibilities for deploying high-performance video understanding\nsystems in resource-constrained environments across various applications.\n","authors":["Mahrukh Awan","Asmar Nadeem","Muhammad Junaid Awan","Armin Mustafa","Syed Sameed Husain"],"pdf_url":"https://arxiv.org/pdf/2408.14441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14435v1","updated":"2024-08-26T17:21:54Z","published":"2024-08-26T17:21:54Z","title":"Social perception of faces in a vision-language model","summary":" We explore social perception of human faces in CLIP, a widely used\nopen-source vision-language model. To this end, we compare the similarity in\nCLIP embeddings between different textual prompts and a set of face images. Our\ntextual prompts are constructed from well-validated social psychology terms\ndenoting social perception. The face images are synthetic and are\nsystematically and independently varied along six dimensions: the legally\nprotected attributes of age, gender, and race, as well as facial expression,\nlighting, and pose. 
Independently and systematically manipulating face\nattributes allows us to study the effect of each on social perception and\navoids confounds that can occur in wild-collected data due to uncontrolled\nsystematic correlations between attributes. Thus, our findings are experimental\nrather than observational. Our main findings are three. First, while CLIP is\ntrained on the widest variety of images and texts, it is able to make\nfine-grained human-like social judgments on face images. Second, age, gender,\nand race do systematically impact CLIP's social perception of faces, suggesting\nan undesirable bias in CLIP vis-a-vis legally protected attributes. Most\nstrikingly, we find a strong pattern of bias concerning the faces of Black\nwomen, where CLIP produces extreme values of social perception across different\nages and facial expressions. Third, facial expression impacts social perception\nmore than age and lighting as much as age. The last finding predicts that\nstudies that do not control for unprotected visual attributes may reach the\nwrong conclusions on bias. Our novel method of investigation, which is founded\non the social psychology literature and on the experiments involving the\nmanipulation of individual attributes, yields sharper and more reliable\nobservations than previous observational methods and may be applied to study\nbiases in any vision-language model.\n","authors":["Carina I. Hausladen","Manuel Knott","Colin F. Camerer","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2408.14435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14427v1","updated":"2024-08-26T17:15:37Z","published":"2024-08-26T17:15:37Z","title":"Few-Shot 3D Volumetric Segmentation with Multi-Surrogate Fusion","summary":" Conventional 3D medical image segmentation methods typically require learning\nheavy 3D networks (e.g., 3D-UNet), as well as large amounts of in-domain data\nwith accurate pixel/voxel-level labels to avoid overfitting. These solutions\nare thus extremely time- and labor-expensive, but also may easily fail to\ngeneralize to unseen objects during training. To alleviate this issue, we\npresent MSFSeg, a novel few-shot 3D segmentation framework with a lightweight\nmulti-surrogate fusion (MSF). MSFSeg is able to automatically segment unseen 3D\nobjects/organs (during training) provided with one or a few annotated 2D slices\nor 3D sequence segments, via learning dense query-support organ/lesion anatomy\ncorrelations across patient populations. Our proposed MSF module mines\ncomprehensive and diversified morphology correlations between unlabeled and the\nfew labeled slices/sequences through multiple designated surrogates, making it\nable to generate accurate cross-domain 3D segmentation masks given annotated\nslices or sequences. We demonstrate the effectiveness of our proposed framework\nby showing superior performance on conventional few-shot segmentation\nbenchmarks compared to prior art, and remarkable cross-domain cross-volume\nsegmentation performance on proprietary 3D segmentation datasets for\nchallenging entities, i.e., tubular structures, with only limited 2D or 3D\nlabels.\n","authors":["Meng Zheng","Benjamin Planche","Zhongpai Gao","Terrence Chen","Richard J. 
Radke","Ziyan Wu"],"pdf_url":"https://arxiv.org/pdf/2408.14427v1.pdf","comment":"Accepted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.14421v1","updated":"2024-08-26T17:04:52Z","published":"2024-08-26T17:04:52Z","title":"Evaluating saliency scores in point clouds of natural environments by\n learning surface anomalies","summary":" In recent years, three-dimensional point clouds are used increasingly to\ndocument natural environments. Each dataset contains a diverse set of objects,\nat varying shapes and sizes, distributed throughout the data and intricately\nintertwined with the topography. Therefore, regions of interest are difficult\nto find and consequent analyses become a challenge. Inspired from visual\nperception principles, we propose to differentiate objects of interest from the\ncluttered environment by evaluating how much they stand out from their\nsurroundings, i.e., their geometric salience. Previous saliency detection\napproaches suggested mostly handcrafted attributes for the task. However, such\nmethods fail when the data are too noisy or have high levels of texture. Here\nwe propose a learning-based mechanism that accommodates noise and textured\nsurfaces. We assume that within the natural environment any change from the\nprevalent surface would suggest a salient object. Thus, we first learn the\nunderlying surface and then search for anomalies within it. Initially, a deep\nneural network is trained to reconstruct the surface. Regions where the\nreconstructed part deviates significantly from the original point cloud yield a\nsubstantial reconstruction error, signifying an anomaly, i.e., saliency. We\ndemonstrate the effectiveness of the proposed approach by searching for salient\nfeatures in various natural scenarios, which were acquired by different\nacquisition platforms. We show the strong correlation between the\nreconstruction error and salient objects.\n","authors":["Reuma Arav","Dennis Wittich","Franz Rottensteiner"],"pdf_url":"https://arxiv.org/pdf/2408.14421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14419v1","updated":"2024-08-26T17:04:23Z","published":"2024-08-26T17:04:23Z","title":"CHARTOM: A Visual Theory-of-Mind Benchmark for Multimodal Large Language\n Models","summary":" We introduce CHARTOM, a visual theory-of-mind benchmark for multimodal large\nlanguage models. CHARTOM consists of specially designed data visualizing\ncharts. Given a chart, a language model needs to not only correctly comprehend\nthe chart (the FACT question) but also judge if the chart will be misleading to\na human reader (the MIND question). Both questions have significant societal\nbenefits. We detail the construction of the CHARTOM benchmark including its\ncalibration on human performance.\n","authors":["Shubham Bharti","Shiyun Cheng","Jihyun Rho","Martina Rao","Xiaojin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14415v1","updated":"2024-08-26T17:02:25Z","published":"2024-08-26T17:02:25Z","title":"LoG-VMamba: Local-Global Vision Mamba for Medical Image Segmentation","summary":" Mamba, a State Space Model (SSM), has recently shown competitive performance\nto Convolutional Neural Networks (CNNs) and Transformers in Natural Language\nProcessing and general sequence modeling. Various attempts have been made to\nadapt Mamba to Computer Vision tasks, including medical image segmentation\n(MIS). 
Vision Mamba (VM)-based networks are particularly attractive due to\ntheir ability to achieve global receptive fields, similar to Vision\nTransformers, while also maintaining linear complexity in the number of tokens.\nHowever, the existing VM models still struggle to maintain both spatially local\nand global dependencies of tokens in high dimensional arrays due to their\nsequential nature. Employing multiple and/or complicated scanning strategies is\ncomputationally costly, which hinders applications of SSMs to high-dimensional\n2D and 3D images that are common in MIS problems. In this work, we propose\nLocal-Global Vision Mamba, LoG-VMamba, that explicitly enforces spatially\nadjacent tokens to remain nearby on the channel axis, and retains the global\ncontext in a compressed form. Our method allows the SSMs to access the local\nand global contexts even before reaching the last token while requiring only a\nsimple scanning strategy. Our segmentation models are computationally efficient\nand substantially outperform both CNN and Transformers-based baselines on a\ndiverse set of 2D and 3D MIS tasks. The implementation of LoG-VMamba is\navailable at \\url{https://github.com/Oulu-IMEDS/LoG-VMamba}.\n","authors":["Trung Dinh Quoc Dang","Huy Hoang Nguyen","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2408.14415v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2310.05873v6","updated":"2024-08-26T16:55:02Z","published":"2023-10-09T17:13:10Z","title":"Implicit Concept Removal of Diffusion Models","summary":" Text-to-image (T2I) diffusion models often inadvertently generate unwanted\nconcepts such as watermarks and unsafe images. These concepts, termed as the\n\"implicit concepts\", could be unintentionally learned during training and then\nbe generated uncontrollably during inference. Existing removal methods still\nstruggle to eliminate implicit concepts primarily due to their dependency on\nthe model's ability to recognize concepts it actually can not discern. To\naddress this, we utilize the intrinsic geometric characteristics of implicit\nconcepts and present the Geom-Erasing, a novel concept removal method based on\nthe geometric-driven control. Specifically, once an unwanted implicit concept\nis identified, we integrate the existence and geometric information of the\nconcept into the text prompts with the help of an accessible classifier or\ndetector model. Subsequently, the model is optimized to identify and\ndisentangle this information, which is then adopted as negative prompts during\ngeneration. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel\nimage-text dataset imbued with three typical implicit concepts (i.e., QR codes,\nwatermarks, and text), reflecting real-life situations where implicit concepts\nare easily injected. 
Geom-Erasing effectively mitigates the generation of\nimplicit concepts, achieving the state-of-the-art results on the Inappropriate\nImage Prompts (I2P) and our challenging Implicit Concept Dataset (ICD)\nbenchmarks.\n","authors":["Zhili Liu","Kai Chen","Yifan Zhang","Jianhua Han","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung","James Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.05873v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02060v3","updated":"2024-08-26T16:42:45Z","published":"2023-10-03T14:03:20Z","title":"Global Attractor for a Reaction-Diffusion Model Arising in Biological\n Dynamic in 3D Soil Structure","summary":" Partial Differential Equations (PDEs) play a crucial role as tools for\nmodeling and comprehending intricate natural processes, notably within the\ndomain of biology. This research explores the domain of microbial activity\nwithin the complex matrix of 3D soil structures, providing valuable\nunderstanding into both the existence and uniqueness of solutions and the\nasymptotic behavior of the corresponding PDE model. Our investigation results\nin the discovery of a global attractor, a fundamental feature with significant\nimplications for long-term system behavior. To enhance the clarity of our\nfindings, numerical simulations are employed to visually illustrate the\nattributes of this global attractor.\n","authors":["Mohamed Elghandouri","Khalil Ezzinbi","Mouad Klai","Olivier Monga"],"pdf_url":"https://arxiv.org/pdf/2310.02060v3.pdf","comment":"Preprint submitted to Mathematical Modeling in Natural Phenomena"},{"id":"http://arxiv.org/abs/2408.14400v1","updated":"2024-08-26T16:34:13Z","published":"2024-08-26T16:34:13Z","title":"Satellite Sunroof: High-res Digital Surface Models and Roof Segmentation\n for Global Solar Mapping","summary":" The transition to renewable energy, particularly solar, is key to mitigating\nclimate change. Google's Solar API aids this transition by estimating solar\npotential from aerial imagery, but its impact is constrained by geographical\ncoverage. This paper proposes expanding the API's reach using satellite\nimagery, enabling global solar potential assessment. We tackle challenges\ninvolved in building a Digital Surface Model (DSM) and roof instance\nsegmentation from lower resolution and single oblique views using deep learning\nmodels. Our models, trained on aligned satellite and aerial datasets, produce\n25cm DSMs and roof segments. With ~1m DSM MAE on buildings, ~5deg roof pitch\nerror and ~56% IOU on roof segmentation, they significantly enhance the Solar\nAPI's potential to promote solar adoption.\n","authors":["Vishal Batchu","Alex Wilson","Betty Peng","Carl Elkin","Umangi Jain","Christopher Van Arsdale","Ross Goroshin","Varun Gulshan"],"pdf_url":"https://arxiv.org/pdf/2408.14400v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.14397v1","updated":"2024-08-26T16:28:56Z","published":"2024-08-26T16:28:56Z","title":"Uncovering Knowledge Gaps in Radiology Report Generation Models through\n Knowledge Graphs","summary":" Recent advancements in artificial intelligence have significantly improved\nthe automatic generation of radiology reports. However, existing evaluation\nmethods fail to reveal the models' understanding of radiological images and\ntheir capacity to achieve human-level granularity in descriptions. To bridge\nthis gap, we introduce a system, named ReXKG, which extracts structured\ninformation from processed reports to construct a comprehensive radiology\nknowledge graph. 
We then propose three metrics to evaluate the similarity of\nnodes (ReXKG-NSC), distribution of edges (ReXKG-AMS), and coverage of subgraphs\n(ReXKG-SCS) across various knowledge graphs. We conduct an in-depth comparative\nanalysis of AI-generated and human-written radiology reports, assessing the\nperformance of both specialist and generalist models. Our study provides a\ndeeper understanding of the capabilities and limitations of current AI models\nin radiology report generation, offering valuable insights for improving model\nperformance and clinical applicability.\n","authors":["Xiaoman Zhang","Julián N. Acosta","Hong-Yu Zhou","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2408.14397v1.pdf","comment":"Code is available at: https://github.com/rajpurkarlab/ReXKG"},{"id":"http://arxiv.org/abs/2402.00752v4","updated":"2024-08-26T16:27:42Z","published":"2024-02-01T16:43:58Z","title":"On the Error Analysis of 3D Gaussian Splatting and an Optimal Projection\n Strategy","summary":" 3D Gaussian Splatting has garnered extensive attention and application in\nreal-time neural rendering. Concurrently, concerns have been raised about the\nlimitations of this technology in aspects such as point cloud storage,\nperformance, and robustness in sparse viewpoints, leading to various\nimprovements. However, there has been a notable lack of attention to the\nfundamental problem of projection errors introduced by the local affine\napproximation inherent in the splatting itself, and the consequential impact of\nthese errors on the quality of photo-realistic rendering. This paper addresses\nthe projection error function of 3D Gaussian Splatting, commencing with the\nresidual error from the first-order Taylor expansion of the projection\nfunction. The analysis establishes a correlation between the error and the\nGaussian mean position. Subsequently, leveraging function optimization theory,\nthis paper analyzes the function's minima to provide an optimal projection\nstrategy for Gaussian Splatting referred to Optimal Gaussian Splatting, which\ncan accommodate a variety of camera models. Experimental validation further\nconfirms that this projection methodology reduces artifacts, resulting in a\nmore convincingly realistic rendering.\n","authors":["Letian Huang","Jiayang Bai","Jie Guo","Yuanqi Li","Yanwen Guo"],"pdf_url":"https://arxiv.org/pdf/2402.00752v4.pdf","comment":"Accepted by ECCV2024; Project Page:\n https://letianhuang.github.io/op43dgs/"},{"id":"http://arxiv.org/abs/2404.03507v3","updated":"2024-08-26T16:22:35Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. 
DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03771v2","updated":"2024-08-26T16:15:57Z","published":"2024-07-04T09:32:12Z","title":"SpikeGS: Reconstruct 3D scene via fast-moving bio-inspired sensors","summary":" 3D Gaussian Splatting (3DGS) demonstrates unparalleled superior performance\nin 3D scene reconstruction. However, 3DGS heavily relies on sharp images.\nFulfilling this requirement can be challenging in real-world scenarios\nespecially when the camera moves fast, which severely limits the application of\n3DGS. To address these challenges, we propose Spike Gaussian Splatting\n(SpikeGS), the first framework that integrates the spike streams into the 3DGS\npipeline to reconstruct 3D scenes via a fast-moving bio-inspired camera. With\naccumulation rasterization, interval supervision, and a specially designed\npipeline, SpikeGS extracts detailed geometry and texture from high temporal\nresolution but texture-lacking spike streams, and reconstructs 3D scenes captured in\n1 second. Extensive experiments on multiple synthetic and real-world datasets\ndemonstrate the superiority of SpikeGS compared with existing spike-based and\ndeblur 3D scene reconstruction methods. Codes and data will be released soon.\n","authors":["Yijia Guo","Liwen Hu","Lei Ma","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2407.03771v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05180v2","updated":"2024-08-26T16:13:30Z","published":"2024-04-08T04:10:50Z","title":"GloSoFarID: Global multispectral dataset for Solar Farm IDentification\n in satellite imagery","summary":" Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal\nsolution in the global pursuit of clean and renewable energy. This technology\naddresses the urgent need for sustainable energy alternatives by converting\nsolar power into electricity without greenhouse gas emissions. It not only\ncurtails global carbon emissions but also reduces reliance on finite,\nnon-renewable energy sources. In this context, monitoring solar panel farms\nbecomes essential for understanding and facilitating the worldwide shift toward\nclean energy. This study contributes to this effort by developing the first\ncomprehensive global dataset of multispectral satellite imagery of solar panel\nfarms. This dataset is intended to form the basis for training robust machine\nlearning models, which can accurately map and analyze the expansion and\ndistribution of solar panel farms globally. The insights gained from this\nendeavor will be instrumental in guiding informed decision-making for a\nsustainable energy future. https://github.com/yzyly1992/GloSoFarID\n","authors":["Zhiyuan Yang","Ryan Rad"],"pdf_url":"https://arxiv.org/pdf/2404.05180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14381v1","updated":"2024-08-26T16:04:13Z","published":"2024-08-26T16:04:13Z","title":"Learning Tree-Structured Composition of Data Augmentation","summary":" Data augmentation is widely used for training a neural network given little\nlabeled data. 
A common practice of augmentation training is applying a\ncomposition of multiple transformations sequentially to the data. Existing\naugmentation methods such as RandAugment randomly sample from a list of\npre-selected transformations, while methods such as AutoAugment apply advanced\nsearch to optimize over an augmentation set of size $k^d$, which is the number\nof transformation sequences of length $d$, given a list of $k$ transformations.\n In this paper, we design efficient algorithms whose running time complexity\nis much faster than the worst-case complexity of $O(k^d)$, provably. We propose\na new algorithm to search for a binary tree-structured composition of $k$\ntransformations, where each tree node corresponds to one transformation. The\nbinary tree generalizes sequential augmentations, such as the SimCLR\naugmentation scheme for contrastive learning. Using a top-down, recursive\nsearch procedure, our algorithm achieves a runtime complexity of $O(2^d k)$,\nwhich is much faster than $O(k^d)$ as $k$ increases above $2$. We apply our\nalgorithm to tackle data distributions with heterogeneous subpopulations by\nsearching for one tree in each subpopulation and then learning a weighted\ncombination, resulting in a forest of trees.\n We validate our proposed algorithms on numerous graph and image datasets,\nincluding a multi-label graph classification dataset we collected. The dataset\nexhibits significant variations in the sizes of graphs and their average\ndegrees, making it ideal for studying data augmentation. We show that our\napproach can reduce the computation cost by 43% over existing search methods\nwhile improving performance by 4.3%. The tree structures can be used to\ninterpret the relative importance of each transformation, such as identifying\nthe important transformations on small vs. large graphs.\n","authors":["Dongyue Li","Kailai Chen","Predrag Radivojac","Hongyang R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14381v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2408.14371v1","updated":"2024-08-26T15:53:50Z","published":"2024-08-26T15:53:50Z","title":"SelEx: Self-Expertise in Fine-Grained Generalized Category Discovery","summary":" In this paper, we address Generalized Category Discovery, aiming to\nsimultaneously uncover novel categories and accurately classify known ones.\nTraditional methods, which lean heavily on self-supervision and contrastive\nlearning, often fall short when distinguishing between fine-grained categories.\nTo address this, we introduce a novel concept called `self-expertise', which\nenhances the model's ability to recognize subtle differences and uncover\nunknown categories. Our approach combines unsupervised and supervised\nself-expertise strategies to refine the model's discernment and generalization.\nInitially, hierarchical pseudo-labeling is used to provide `soft supervision',\nimproving the effectiveness of self-expertise. Our supervised technique differs\nfrom traditional methods by utilizing more abstract positive and negative\nsamples, aiding in the formation of clusters that can generalize to novel\ncategories. Meanwhile, our unsupervised strategy encourages the model to\nsharpen its category distinctions by considering within-category examples as\n`hard' negatives. Supported by theoretical insights, our empirical results\nshowcase that our method outperforms existing state-of-the-art techniques in\nGeneralized Category Discovery across several fine-grained datasets. 
Our code\nis available at: https://github.com/SarahRastegar/SelEx.\n","authors":["Sarah Rastegar","Mohammadreza Salehi","Yuki M. Asano","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2408.14371v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2405.03762v2","updated":"2024-08-26T15:40:18Z","published":"2024-05-06T18:01:13Z","title":"Swin transformers are robust to distribution and concept drift in\n endoscopy-based longitudinal rectal cancer assessment","summary":" Endoscopic images are used at various stages of rectal cancer treatment\nstarting from cancer screening, diagnosis, during treatment to assess response\nand toxicity from treatments such as colitis, and at follow up to detect new\ntumor or local regrowth (LR). However, subjective assessment is highly variable\nand can underestimate the degree of response in some patients, subjecting them\nto unnecessary surgery, or overestimate response that places patients at risk\nof disease spread. Advances in deep learning have shown the ability to produce\nconsistent and objective response assessment for endoscopic images. However,\nmethods for detecting cancers, regrowth, and monitoring response during the\nentire course of patient treatment and follow-up are lacking. This is because\nautomated diagnosis and rectal cancer response assessment require methods that\nare robust to inherent imaging illumination variations and confounding\nconditions (blood, scope, blurring) present in endoscopy images as well as\nchanges to the normal lumen and tumor during treatment. Hence, a hierarchical\nshifted window (Swin) transformer was trained to distinguish rectal cancer from\nnormal lumen using endoscopy images. Swin as well as two convolutional\n(ResNet-50, WideResNet-50), and vision transformer (ViT) models were trained\nand evaluated on follow-up longitudinal images to detect LR on a private dataset\nas well as on out-of-distribution (OOD) public colonoscopy datasets to detect\npre/non-cancerous polyps. Color shifts were applied using optimal transport to\nsimulate distribution shifts. Swin and ResNet models were similarly accurate in\nthe in-distribution dataset. Swin was more accurate than other methods\n(follow-up: 0.84, OOD: 0.83) even when subject to color shifts (follow-up:\n0.83, OOD: 0.87), indicating capability to provide robust performance for\nlongitudinal cancer assessment.\n","authors":["Jorge Tapias Gomez","Aneesh Rangnekar","Hannah Williams","Hannah Thompson","Julio Garcia-Aguilar","Joshua Jesse Smith","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2405.03762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14358v1","updated":"2024-08-26T15:32:31Z","published":"2024-08-26T15:32:31Z","title":"An Embedding is Worth a Thousand Noisy Labels","summary":" The performance of deep neural networks scales with dataset size and label\nquality, rendering the efficient mitigation of low-quality data annotations\ncrucial for building robust and cost-effective systems. Existing strategies to\naddress label noise exhibit severe limitations due to computational complexity\nand application dependency. In this work, we propose WANN, a Weighted Adaptive\nNearest Neighbor approach that builds on self-supervised feature\nrepresentations obtained from foundation models. To guide the weighted voting\nscheme, we introduce a reliability score, which measures the likelihood of a\ndata label being correct. 
WANN outperforms reference methods, including a\nlinear layer trained with robust loss functions, on diverse datasets of varying\nsize and under various noise types and severities. WANN also exhibits superior\ngeneralization on imbalanced data compared to both Adaptive-NNs (ANN) and fixed\nk-NNs. Furthermore, the proposed weighting scheme enhances supervised\ndimensionality reduction under noisy labels. This yields a significant boost in\nclassification performance with 10x and 100x smaller image embeddings,\nminimizing latency and storage requirements. Our approach, emphasizing\nefficiency and explainability, emerges as a simple, robust solution to overcome\nthe inherent limitations of deep neural network training. The code is available\nat https://github.com/francescodisalvo05/wann-noisy-labels .\n","authors":["Francesco Di Salvo","Sebastian Doerrich","Ines Rieger","Christian Ledig"],"pdf_url":"https://arxiv.org/pdf/2408.14358v1.pdf","comment":"Preprint submitted to the International Journal of Computer Vision\n (IJCV)"},{"id":"http://arxiv.org/abs/2408.14348v1","updated":"2024-08-26T15:26:27Z","published":"2024-08-26T15:26:27Z","title":"Deep learning-based ecological analysis of camera trap images is\n impacted by training data quality and size","summary":" Large wildlife image collections from camera traps are crucial for\nbiodiversity monitoring, offering insights into species richness, occupancy,\nand activity patterns. However, manual processing of these data is\ntime-consuming, hindering analytical processes. To address this, deep neural\nnetworks have been widely adopted to automate image analysis. Despite their\ngrowing use, the impact of model training decisions on downstream ecological\nmetrics remains unclear. Here, we analyse camera trap data from an African\nsavannah and an Asian sub-tropical dry forest to compare key ecological metrics\nderived from expert-generated species identifications with those generated from\ndeep neural networks. We assess the impact of model architecture, training data\nnoise, and dataset size on ecological metrics, including species richness,\noccupancy, and activity patterns. Our results show that while model\narchitecture has minimal impact, large amounts of noise and reduced dataset\nsize significantly affect these metrics. Nonetheless, estimated ecological\nmetrics are resilient to considerable noise, tolerating up to 10% error in\nspecies labels and a 50% reduction in training set size without changing\nsignificantly. We also highlight that conventional metrics like classification\nerror may not always be representative of a model's ability to accurately\nmeasure ecological metrics. We conclude that ecological metrics derived from\ndeep neural network predictions closely match those calculated from expert\nlabels and remain robust to variations in the factors explored. However,\ntraining decisions for deep neural networks can impact downstream ecological\nanalysis. Therefore, practitioners should prioritize creating large, clean\ntraining sets and evaluate deep neural network solutions based on their ability\nto measure the ecological metrics of interest.\n","authors":["Omiros Pantazis","Peggy Bevan","Holly Pringle","Guilherme Braga Ferreira","Daniel J. Ingram","Emily Madsen","Liam Thomas","Dol Raj Thanet","Thakur Silwal","Santosh Rayamajhi","Gabriel Brostow","Oisin Mac Aodha","Kate E. 
Jones"],"pdf_url":"https://arxiv.org/pdf/2408.14348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11126v2","updated":"2024-08-26T15:19:12Z","published":"2024-08-20T18:26:09Z","title":"Binocular Model: A deep learning solution for online melt pool\n temperature analysis using dual-wavelength Imaging Pyrometry","summary":" In metal Additive Manufacturing (AM), monitoring the temperature of the Melt\nPool (MP) is crucial for ensuring part quality, process stability, defect\nprevention, and overall process optimization. Traditional methods, are slow to\nconverge and require extensive manual effort to translate data into actionable\ninsights, rendering them impractical for real-time monitoring and control. To\naddress this challenge, we propose an Artificial Intelligence (AI)-based\nsolution aimed at reducing manual data processing reliance and improving the\nefficiency of transitioning from data to insight. In our study, we utilize a\ndataset comprising dual-wavelength real-time process monitoring data and\ncorresponding temperature maps. We introduce a deep learning model called the\n\"Binocular model,\" which exploits dual input observations to perform a precise\nanalysis of MP temperature in Laser Powder Bed Fusion (L-PBF). Through advanced\ndeep learning techniques, we seamlessly convert raw data into temperature maps,\nsignificantly streamlining the process and enabling batch processing at a rate\nof up to 750 frames per second, approximately 1000 times faster than\nconventional methods. Our Binocular model achieves high accuracy in temperature\nestimation, evidenced by a 0.95 R-squared score, while simultaneously enhancing\nprocessing efficiency by a factor of $\\sim1000x$ times. This model directly\naddresses the challenge of real-time MP temperature monitoring and offers\ninsights into the encountered constraints and the benefits of our Deep\nLearning-based approach. By combining efficiency and precision, our work\ncontributes to the advancement of temperature monitoring in L-PBF, thus driving\nprogress in the field of metal AM.\n","authors":["Javid Akhavan","Chaitanya Krishna Vallabh","Xiayun Zhao","Souran Manoochehri"],"pdf_url":"https://arxiv.org/pdf/2408.11126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14343v1","updated":"2024-08-26T15:16:28Z","published":"2024-08-26T15:16:28Z","title":"A Brief Analysis of the Iterative Next Boundary Detection Network for\n Tree Rings Delineation in Images of Pinus taeda","summary":" This work presents the INBD network proposed by Gillert et al. in CVPR-2023\nand studies its application for delineating tree rings in RGB images of Pinus\ntaeda cross sections captured by a smartphone (UruDendro dataset), which are\nimages with different characteristics from the ones used to train the method.\nThe INBD network operates in two stages: first, it segments the background,\npith, and ring boundaries. In the second stage, the image is transformed into\npolar coordinates, and ring boundaries are iteratively segmented from the pith\nto the bark. Both stages are based on the U-Net architecture. The method\nachieves an F-Score of 77.5, a mAR of 0.540, and an ARAND of 0.205 on the\nevaluation set. 
The code for the experiments is available at\nhttps://github.com/hmarichal93/mlbrief_inbd.\n","authors":["Henry Marichal","Gregory Randall"],"pdf_url":"https://arxiv.org/pdf/2408.14343v1.pdf","comment":"Submitted to IPOL ad an MLBriefs paper"},{"id":"http://arxiv.org/abs/2408.14339v1","updated":"2024-08-26T15:08:12Z","published":"2024-08-26T15:08:12Z","title":"ConceptMix: A Compositional Image Generation Benchmark with Controllable\n Difficulty","summary":" Compositionality is a critical capability in Text-to-Image (T2I) models, as\nit reflects their ability to understand and combine multiple concepts from text\ndescriptions. Existing evaluations of compositional capability rely heavily on\nhuman-designed text prompts or fixed templates, limiting their diversity and\ncomplexity, and yielding low discriminative power. We propose ConceptMix, a\nscalable, controllable, and customizable benchmark which automatically\nevaluates compositional generation ability of T2I models. This is done in two\nstages. First, ConceptMix generates the text prompts: concretely, using\ncategories of visual concepts (e.g., objects, colors, shapes, spatial\nrelationships), it randomly samples an object and k-tuples of visual concepts,\nthen uses GPT4-o to generate text prompts for image generation based on these\nsampled concepts. Second, ConceptMix evaluates the images generated in response\nto these prompts: concretely, it checks how many of the k concepts actually\nappeared in the image by generating one question per visual concept and using a\nstrong VLM to answer them. Through administering ConceptMix to a diverse set of\nT2I models (proprietary as well as open ones) using increasing values of k, we\nshow that our ConceptMix has higher discrimination power than earlier\nbenchmarks. Specifically, ConceptMix reveals that the performance of several\nmodels, especially open models, drops dramatically with increased k.\nImportantly, it also provides insight into the lack of prompt diversity in\nwidely-used training datasets. Additionally, we conduct extensive human studies\nto validate the design of ConceptMix and compare our automatic grading with\nhuman judgement. We hope it will guide future T2I model development.\n","authors":["Xindi Wu","Dingli Yu","Yangsibo Huang","Olga Russakovsky","Sanjeev Arora"],"pdf_url":"https://arxiv.org/pdf/2408.14339v1.pdf","comment":"43 pages"},{"id":"http://arxiv.org/abs/2408.14336v1","updated":"2024-08-26T15:07:01Z","published":"2024-08-26T15:07:01Z","title":"Equivariant Reinforcement Learning under Partial Observability","summary":" Incorporating inductive biases is a promising approach for tackling\nchallenging robot learning domains with sample-efficient solutions. This paper\nidentifies partially observable domains where symmetries can be a useful\ninductive bias for efficient learning. Specifically, by encoding the\nequivariance regarding specific group symmetries into the neural networks, our\nactor-critic reinforcement learning agents can reuse solutions in the past for\nrelated scenarios. 
Consequently, our equivariant agents outperform\nnon-equivariant approaches significantly in terms of sample efficiency and\nfinal performance, demonstrated through experiments on a range of robotic tasks\nin simulation and real hardware.\n","authors":["Hai Nguyen","Andrea Baisero","David Klee","Dian Wang","Robert Platt","Christopher Amato"],"pdf_url":"https://arxiv.org/pdf/2408.14336v1.pdf","comment":"Conference on Robot Learning, 2023"},{"id":"http://arxiv.org/abs/2408.14329v1","updated":"2024-08-26T14:55:23Z","published":"2024-08-26T14:55:23Z","title":"PHEVA: A Privacy-preserving Human-centric Video Anomaly Detection\n Dataset","summary":" PHEVA, a Privacy-preserving Human-centric Ethical Video Anomaly detection\ndataset. By removing pixel information and providing only de-identified human\nannotations, PHEVA safeguards personally identifiable information. The dataset\nincludes seven indoor/outdoor scenes, featuring one novel, context-specific\ncamera, and offers over 5x the pose-annotated frames compared to the largest\nprevious dataset. This study benchmarks state-of-the-art methods on PHEVA using\na comprehensive set of metrics, including the 10% Error Rate (10ER), a metric\nused for anomaly detection for the first time providing insights relevant to\nreal-world deployment. As the first of its kind, PHEVA bridges the gap between\nconventional training and real-world deployment by introducing continual\nlearning benchmarks, with models outperforming traditional methods in 82.14% of\ncases. The dataset is publicly available at\nhttps://github.com/TeCSAR-UNCC/PHEVA.git.\n","authors":["Ghazal Alinezhad Noghre","Shanle Yao","Armin Danesh Pazho","Babak Rahimi Ardabili","Vinit Katariya","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2408.14329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14326v1","updated":"2024-08-26T14:54:14Z","published":"2024-08-26T14:54:14Z","title":"Streamline tractography of the fetal brain in utero with machine\n learning","summary":" Diffusion-weighted magnetic resonance imaging (dMRI) is the only non-invasive\ntool for studying white matter tracts and structural connectivity of the brain.\nThese assessments rely heavily on tractography techniques, which reconstruct\nvirtual streamlines representing white matter fibers. Much effort has been\ndevoted to improving tractography methodology for adult brains, while\ntractography of the fetal brain has been largely neglected. Fetal tractography\nfaces unique difficulties due to low dMRI signal quality, immature and rapidly\ndeveloping brain structures, and paucity of reference data. This work presents\nthe first machine learning model for fetal tractography. The model input\nconsists of five sources of information: (1) Fiber orientation, inferred from a\ndiffusion tensor fit to the dMRI signal; (2) Directions of recent propagation\nsteps; (3) Global spatial information, encoded as distances to keypoints in the\nbrain cortex; (4) Tissue segmentation information; and (5) Prior information\nabout the expected local fiber orientations supplied with an atlas. In order to\nmitigate the local tensor estimation error, a large spatial context around the\ncurrent point in the diffusion tensor image is encoded using convolutional and\nattention neural network modules. Moreover, the diffusion tensor information at\na hypothetical next point is included in the model input. Filtering rules based\non anatomically constrained tractography are applied to prune implausible\nstreamlines. 
We trained the model on manually-refined whole-brain fetal\ntractograms and validated the trained model on an independent set of 11 test\nscans with gestational ages between 23 and 36 weeks. Results show that our\nproposed method achieves superior performance across all evaluated tracts. The\nnew method can significantly advance the capabilities of dMRI for studying\nnormal and abnormal brain development in utero.\n","authors":["Weide Liu","Camilo Calixto","Simon K. Warfield","Davood Karimi"],"pdf_url":"https://arxiv.org/pdf/2408.14326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15699v3","updated":"2024-08-26T14:39:33Z","published":"2023-05-25T04:14:49Z","title":"Cross-view Action Recognition Understanding From Exocentric to\n Egocentric Perspective","summary":" Understanding action recognition in egocentric videos has emerged as a vital\nresearch topic with numerous practical applications. With the limitation in the\nscale of egocentric data collection, learning robust deep learning-based action\nrecognition models remains difficult. Transferring knowledge learned from the\nlarge-scale exocentric data to the egocentric data is challenging due to the\ndifference in videos across views. Our work introduces a novel cross-view\nlearning approach to action recognition (CVAR) that effectively transfers\nknowledge from the exocentric to the selfish view. First, we present a novel\ngeometric-based constraint into the self-attention mechanism in Transformer\nbased on analyzing the camera positions between two views. Then, we propose a\nnew cross-view self-attention loss learned on unpaired cross-view data to\nenforce the self-attention mechanism learning to transfer knowledge across\nviews. Finally, to further improve the performance of our cross-view learning\napproach, we present the metrics to measure the correlations in videos and\nattention maps effectively. Experimental results on standard egocentric action\nrecognition benchmarks, i.e., Charades-Ego, EPIC-Kitchens-55, and\nEPIC-Kitchens-100, have shown our approach's effectiveness and state-of-the-art\nperformance.\n","authors":["Thanh-Dat Truong","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.15699v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14284v1","updated":"2024-08-26T14:09:40Z","published":"2024-08-26T14:09:40Z","title":"May the Forgetting Be with You: Alternate Replay for Learning with Noisy\n Labels","summary":" Forgetting presents a significant challenge during incremental training,\nmaking it particularly demanding for contemporary AI systems to assimilate new\nknowledge in streaming data environments. To address this issue, most\napproaches in Continual Learning (CL) rely on the replay of a restricted buffer\nof past data. However, the presence of noise in real-world scenarios, where\nhuman annotation is constrained by time limitations or where data is\nautomatically gathered from the web, frequently renders these strategies\nvulnerable. In this study, we address the problem of CL under Noisy Labels\n(CLN) by introducing Alternate Experience Replay (AER), which takes advantage\nof forgetting to maintain a clear distinction between clean, complex, and noisy\nsamples in the memory buffer. The idea is that complex or mislabeled examples,\nwhich hardly fit the previously learned data distribution, are most likely to\nbe forgotten. 
To grasp the benefits of such a separation, we equip AER with\nAsymmetric Balanced Sampling (ABS): a new sample selection strategy that\nprioritizes purity on the current task while retaining relevant samples from\nthe past. Through extensive computational comparisons, we demonstrate the\neffectiveness of our approach in terms of both accuracy and purity of the\nobtained buffer, resulting in a remarkable average gain of 4.71 percentage points in\naccuracy with respect to existing loss-based purification strategies. Code is\navailable at https://github.com/aimagelab/mammoth.\n","authors":["Monica Millunzi","Lorenzo Bonicelli","Angelo Porrello","Jacopo Credi","Petter N. Kolm","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2408.14284v1.pdf","comment":"25 pages, 5 figures. Accepted at the 35th British Machine Vision\n Conference 2024 (BMVC 2024), Glasgow, UK"},{"id":"http://arxiv.org/abs/2408.12615v2","updated":"2024-08-26T14:06:59Z","published":"2024-08-08T14:11:06Z","title":"Pediatric TSC-Related Epilepsy Classification from Clinical MR Images\n Using Quantum Neural Network","summary":" Tuberous sclerosis complex (TSC) manifests as a multisystem disorder with\nsignificant neurological implications. This study addresses the critical need\nfor robust classification models tailored to TSC in pediatric patients,\nintroducing QResNet, a novel deep learning model seamlessly integrating\nconventional convolutional neural networks with quantum neural networks. The\nmodel incorporates a two-layer quantum layer (QL), comprising ZZFeatureMap and\nAnsatz layers, strategically designed for processing classical data within a\nquantum framework. A comprehensive evaluation demonstrates the superior\nperformance of QResNet in TSC MRI image classification compared to conventional\n3D-ResNet models. These compelling findings underscore the potential of quantum\ncomputing to revolutionize medical imaging and diagnostics. Remarkably, this\nmethod surpasses conventional CNNs in accuracy and Area Under the Curve (AUC)\nmetrics with the current dataset. Future research endeavors may focus on\nexploring the scalability and practical implementation of quantum algorithms in\nreal-world medical imaging scenarios.\n","authors":["Ling Lin","Yihang Zhou","Zhanqi Hu","Dian Jiang","Congcong Liu","Shuo Zhou","Yanjie Zhu","Jianxiang Liao","Dong Liang","Hairong Zheng","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12615v2.pdf","comment":"5 pages, 4 figures, 2 tables, presented at ISBI 2024"},{"id":"http://arxiv.org/abs/2408.14281v1","updated":"2024-08-26T14:02:30Z","published":"2024-08-26T14:02:30Z","title":"Uncertainties of Latent Representations in Computer Vision","summary":" Uncertainty quantification is a key pillar of trustworthy machine learning.\nIt enables safe reactions under unsafe inputs, like predicting only when the\nmachine learning model detects sufficient evidence, discarding anomalous data,\nor emitting warnings when an error is likely to be inbound. This is\nparticularly crucial in safety-critical areas like medical image classification\nor self-driving cars. Despite the plethora of proposed uncertainty\nquantification methods achieving increasingly higher scores on performance\nbenchmarks, uncertainty estimates are often shied away from in practice. Many\nmachine learning projects start from pretrained latent representations that\ncome without uncertainty estimates. 
Uncertainties would need to be trained by\npractitioners on their own, which is notoriously difficult and\nresource-intense.\n This thesis makes uncertainty estimates easily accessible by adding them to\nthe latent representation vectors of pretrained computer vision models. Besides\nproposing approaches rooted in probability and decision theory, such as\nMonte-Carlo InfoNCE (MCInfoNCE) and loss prediction, we delve into both\ntheoretical and empirical questions. We show that these unobservable\nuncertainties about unobservable latent representations are indeed provably\ncorrect. We also provide an uncertainty-aware representation learning (URL)\nbenchmark to compare these unobservables against observable ground-truths.\nFinally, we compile our findings to pretrain lightweight representation\nuncertainties on large-scale computer vision models that transfer to unseen\ndatasets in a zero-shot manner.\n Our findings do not only advance the current theoretical understanding of\nuncertainties over latent variables, but also facilitate the access to\nuncertainty quantification for future researchers inside and outside the field,\nenabling straightforward but trustworthy machine learning.\n","authors":["Michael Kirchhof"],"pdf_url":"https://arxiv.org/pdf/2408.14281v1.pdf","comment":"Doctoral thesis"},{"id":"http://arxiv.org/abs/2403.05451v2","updated":"2024-08-26T13:58:16Z","published":"2024-03-08T16:57:47Z","title":"Attention-guided Feature Distillation for Semantic Segmentation","summary":" In contrast to existing complex methodologies commonly employed for\ndistilling knowledge from a teacher to a student, this paper showcases the\nefficacy of a simple yet powerful method for utilizing refined feature maps to\ntransfer attention. The proposed method has proven to be effective in\ndistilling rich information, outperforming existing methods in semantic\nsegmentation as a dense prediction task. The proposed Attention-guided Feature\nDistillation (AttnFD) method, employs the Convolutional Block Attention Module\n(CBAM), which refines feature maps by taking into account both channel-specific\nand spatial information content. Simply using the Mean Squared Error (MSE) loss\nfunction between the refined feature maps of the teacher and the student,\nAttnFD demonstrates outstanding performance in semantic segmentation, achieving\nstate-of-the-art results in terms of improving the mean Intersection over Union\n(mIoU) of the student network on the PascalVoc 2012, Cityscapes, COCO, and\nCamVid datasets.\n","authors":["Amir M. Mansourian","Arya Jalali","Rozhan Ahmadi","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2403.05451v2.pdf","comment":"9 pages, 8 figures, and 6 tables"},{"id":"http://arxiv.org/abs/2408.09869v2","updated":"2024-08-26T13:55:59Z","published":"2024-08-19T10:20:06Z","title":"Docling Technical Report","summary":" This technical report introduces Docling, an easy to use, self-contained,\nMIT-licensed open-source package for PDF document conversion. It is powered by\nstate-of-the-art specialized AI models for layout analysis (DocLayNet) and\ntable structure recognition (TableFormer), and runs efficiently on commodity\nhardware in a small resource budget. 
The code interface allows for easy\nextensibility and addition of new features and models.\n","authors":["Christoph Auer","Maksym Lysak","Ahmed Nassar","Michele Dolfi","Nikolaos Livathinos","Panos Vagenas","Cesar Berrospi Ramis","Matteo Omenetti","Fabian Lindlbauer","Kasper Dinkla","Valery Weber","Lucas Morin","Ingmar Meijer","Viktor Kuropiatnyk","Peter W. J. Staar"],"pdf_url":"https://arxiv.org/pdf/2408.09869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14279v1","updated":"2024-08-26T13:55:42Z","published":"2024-08-26T13:55:42Z","title":"Learning Local Pattern Modularization for Point Cloud Reconstruction\n from Unseen Classes","summary":" It is challenging to reconstruct 3D point clouds in unseen classes from\nsingle 2D images. Instead of an object-centered coordinate system, current methods\ngeneralize global priors learned in seen classes to reconstruct 3D shapes from\nunseen classes in a viewer-centered coordinate system. However, the\nreconstruction accuracy and interpretability still leave room for improvement.\nTo resolve this issue, we propose learning local pattern modularization for\nreconstructing 3D shapes in unseen classes, which achieves both good\ngeneralization ability and high reconstruction accuracy. Our insight is to\nlearn a local prior which is class-agnostic and easy to generalize in an\nobject-centered coordinate system. Specifically, the local prior is learned via\na process of learning and customizing local pattern modularization in seen\nclasses. During this process, we first learn a set of patterns in local\nregions, which is the basis in the object-centered coordinate system to\nrepresent an arbitrary region on shapes across different classes. Then, we\nmodularize each region on an initially reconstructed shape using the learned\nlocal patterns. Based on that, we customize the local pattern modularization\nusing the input image by refining the reconstruction with more details. Our\nmethod enables reconstructing high-fidelity point clouds from unseen classes in\nan object-centered coordinate system without requiring a large number of patterns\nor any additional information, such as segmentation supervision or camera\nposes. Our experimental results under widely used benchmarks show that our\nmethod achieves the state-of-the-art reconstruction accuracy for shapes from\nunseen classes. The code is available at https://github.com/chenchao15/Unseen.\n","authors":["Chao Chen","Zhizhong Han","Yu-Shen Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14279v1.pdf","comment":"14 pages, 11 figures, accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2312.06726v3","updated":"2024-08-26T13:52:18Z","published":"2023-12-11T05:57:09Z","title":"Filter & Align: Curating Image-Text Data with Human Knowledge","summary":" The increasing availability of image-text pairs has largely fueled the rapid\nadvancement in vision-language foundation models. However, the vast scale of\nthese datasets inevitably introduces significant variability in data quality,\nwhich can adversely affect the model performance. This highlights the critical\nrole of data filtering, not only to enhance training efficiency but also to\nimprove overall data quality. Existing methods typically rely on metrics such\nas CLIP Score and BLIP Score, which are derived from pre-trained models.\nHowever, these models are often trained on uncurated, noisy datasets, which can\nperpetuate errors and misalignments in the filtered dataset. 
We present a novel\nalgorithm that incorporates human knowledge on image-text alignment to guide\nfiltering vast corpus of web-crawled image-text datasets into a compact and\nhigh-quality form. To systemically capture human preferences on image-text\nalignments, we collect a diverse image-text dataset where each image is\nassociated with multiple captions from various sources, and establish a\ncomprehensive set of both subjective and objective criteria for critically\nguiding the alignment assessment from labelers. Additionally, we train a reward\nmodel on these human-preference annotations to internalize the nuanced human\nunderstanding of image-text alignment. The resulting reward model thus can act\nas a human-like referee to filter image-text pairs. Extensive experiments\ndemonstrate that we can maintain, sometimes even improve, model performance\nwhile compressing the image-text datasets up to ~90%. An impressive example is\nthat, by aggressively reducing the total training sample from 130M to only\n15.5M, our BLIP-B/16 models consistently show an average improvement of 2.9% on\nretrieval tasks and 11.5% on captioning tasks compared to full-size-dataset\ncounterparts.\n","authors":["Lei Zhang","Fangxun Shu","Tianyang Liu","Sucheng Ren","Hao Jiang","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.06726v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14270v1","updated":"2024-08-26T13:45:58Z","published":"2024-08-26T13:45:58Z","title":"Reliable Multi-modal Medical Image-to-image Translation Independent of\n Pixel-wise Aligned Data","summary":" The current mainstream multi-modal medical image-to-image translation methods\nface a contradiction. Supervised methods with outstanding performance rely on\npixel-wise aligned training data to constrain the model optimization. However,\nobtaining pixel-wise aligned multi-modal medical image datasets is challenging.\nUnsupervised methods can be trained without paired data, but their reliability\ncannot be guaranteed. At present, there is no ideal multi-modal medical\nimage-to-image translation method that can generate reliable translation\nresults without the need for pixel-wise aligned data. This work aims to develop\na novel medical image-to-image translation model that is independent of\npixel-wise aligned data (MITIA), enabling reliable multi-modal medical\nimage-to-image translation under the condition of misaligned training data. The\nproposed MITIA model utilizes a prior extraction network composed of a\nmulti-modal medical image registration module and a multi-modal misalignment\nerror detection module to extract pixel-level prior information from training\ndata with misalignment errors to the largest extent. The extracted prior\ninformation is then used to construct a regularization term to constrain the\noptimization of the unsupervised cycle-consistent GAN model, restricting its\nsolution space and thereby improving the performance and reliability of the\ngenerator. We trained the MITIA model using six datasets containing different\nmisalignment errors and two well-aligned datasets. Subsequently, we compared\nthe proposed method with six other state-of-the-art image-to-image translation\nmethods. 
The results of both quantitative analysis and qualitative visual\ninspection indicate that MITIA achieves superior performance compared to the\ncompeting state-of-the-art methods, both on misaligned data and aligned data.\n","authors":["Langrui Zhou","Guang Li"],"pdf_url":"https://arxiv.org/pdf/2408.14270v1.pdf","comment":"This paper has been accepted as a research article by Medical Physics"},{"id":"http://arxiv.org/abs/2210.07182v7","updated":"2024-08-26T13:43:46Z","published":"2022-10-13T17:03:36Z","title":"PDEBENCH: An Extensive Benchmark for Scientific Machine Learning","summary":" Machine learning-based modeling of physical systems has experienced increased\ninterest in recent years. Despite some impressive progress, there is still a\nlack of benchmarks for Scientific ML that are easy to use but still challenging\nand representative of a wide range of problems. We introduce PDEBench, a\nbenchmark suite of time-dependent simulation tasks based on Partial\nDifferential Equations (PDEs). PDEBench comprises both code and data to\nbenchmark the performance of novel machine learning models against both\nclassical numerical simulations and machine learning baselines. Our proposed\nset of benchmark problems contribute the following unique features: (1) A much\nwider range of PDEs compared to existing benchmarks, ranging from relatively\ncommon examples to more realistic and difficult problems; (2) much larger\nready-to-use datasets compared to prior work, comprising multiple simulation\nruns across a larger number of initial and boundary conditions and PDE\nparameters; (3) more extensible source codes with user-friendly APIs for data\ngeneration and baseline results with popular machine learning models (FNO,\nU-Net, PINN, Gradient-Based Inverse Method). PDEBench allows researchers to\nextend the benchmark freely for their own purposes using a standardized API and\nto compare the performance of new models to existing baseline methods. We also\npropose new evaluation metrics with the aim to provide a more holistic\nunderstanding of learning methods in the context of Scientific ML. With those\nmetrics we identify tasks which are challenging for recent ML methods and\npropose these tasks as future challenges for the community. The code is\navailable at https://github.com/pdebench/PDEBench.\n","authors":["Makoto Takamoto","Timothy Praditia","Raphael Leiteritz","Dan MacKinlay","Francesco Alesiani","Dirk Pflüger","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2210.07182v7.pdf","comment":"16 pages (main body) + 34 pages (supplemental material), accepted for\n publication in NeurIPS 2022 Track Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2408.14267v1","updated":"2024-08-26T13:42:43Z","published":"2024-08-26T13:42:43Z","title":"1-Bit FQT: Pushing the Limit of Fully Quantized Training to 1-bit","summary":" Fully quantized training (FQT) accelerates the training of deep neural\nnetworks by quantizing the activations, weights, and gradients into lower\nprecision. To explore the ultimate limit of FQT (the lowest achievable\nprecision), we make a first attempt to 1-bit FQT. We provide a theoretical\nanalysis of FQT based on Adam and SGD, revealing that the gradient variance\ninfluences the convergence of FQT. Building on these theoretical results, we\nintroduce an Activation Gradient Pruning (AGP) strategy. The strategy leverages\nthe heterogeneity of gradients by pruning less informative gradients and\nenhancing the numerical precision of remaining gradients to mitigate gradient\nvariance. 
Additionally, we propose Sample Channel joint Quantization (SCQ),\nwhich utilizes different quantization strategies in the computation of weight\ngradients and activation gradients to ensure that the method is friendly to\nlow-bitwidth hardware. Finally, we present a framework to deploy our algorithm.\nFor fine-tuning VGGNet-16 and ResNet-18 on multiple datasets, our algorithm\nachieves an average accuracy improvement of approximately 6%, compared to\nper-sample quantization. Moreover, our training speedup can reach a maximum of\n5.13x compared to full precision training.\n","authors":["Chang Gao","Jianfei Chen","Kang Zhao","Jiaqi Wang","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2408.14267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09431v2","updated":"2024-08-26T13:41:14Z","published":"2024-04-15T03:12:12Z","title":"VFMM3D: Releasing the Potential of Image by Vision Foundation Model for\n Monocular 3D Object Detection","summary":" Due to its cost-effectiveness and widespread availability, monocular 3D\nobject detection, which relies solely on a single camera during inference,\nholds significant importance across various applications, including autonomous\ndriving and robotics. Nevertheless, directly predicting the coordinates of\nobjects in 3D space from monocular images poses challenges. Therefore, an\neffective solution involves transforming monocular images into LiDAR-like\nrepresentations and employing a LiDAR-based 3D object detector to predict the\n3D coordinates of objects. The key step in this method is accurately converting\nthe monocular image into a reliable point cloud form. In this paper, we present\nVFMM3D, an innovative framework that leverages the capabilities of Vision\nFoundation Models (VFMs) to accurately transform single-view images into LiDAR\npoint cloud representations. VFMM3D utilizes the Segment Anything Model (SAM)\nand Depth Anything Model (DAM) to generate high-quality pseudo-LiDAR data\nenriched with rich foreground information. Specifically, the Depth Anything\nModel (DAM) is employed to generate dense depth maps. Subsequently, the Segment\nAnything Model (SAM) is utilized to differentiate foreground and background\nregions by predicting instance masks. These predicted instance masks and depth\nmaps are then combined and projected into 3D space to generate pseudo-LiDAR\npoints. Finally, any object detectors based on point clouds can be utilized to\npredict the 3D coordinates of objects. Comprehensive experiments are conducted\non two challenging 3D object detection datasets, KITTI and Waymo. Our VFMM3D\nestablishes a new state-of-the-art performance on both datasets. Additionally,\nexperimental results demonstrate the generality of VFMM3D, showcasing its\nseamless integration into various LiDAR-based 3D object detectors.\n","authors":["Bonan Ding","Jin Xie","Jing Nie","Jiale Cao","Xuelong Li","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.09431v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.14253v1","updated":"2024-08-26T13:16:03Z","published":"2024-08-26T13:16:03Z","title":"Text3DAug -- Prompted Instance Augmentation for LiDAR Perception","summary":" LiDAR data of urban scenarios poses unique challenges, such as heterogeneous\ncharacteristics and inherent class imbalance. Therefore, large-scale datasets\nare necessary to apply deep learning methods. Instance augmentation has emerged\nas an efficient method to increase dataset diversity. 
However, current methods\nrequire the time-consuming curation of 3D models or costly manual data\nannotation. To overcome these limitations, we propose Text3DAug, a novel\napproach leveraging generative models for instance augmentation. Text3DAug does\nnot depend on labeled data and is the first of its kind to generate instances\nand annotations from text. This allows for a fully automated pipeline,\neliminating the need for manual effort in practical applications. Additionally,\nText3DAug is sensor agnostic and can be applied regardless of the LiDAR sensor\nused. Comprehensive experimental analysis on LiDAR segmentation, detection and\nnovel class discovery demonstrates that Text3DAug is effective in supplementing\nexisting methods or as a standalone method, performing on par or better than\nestablished methods, however while overcoming their specific drawbacks. The\ncode is publicly available.\n","authors":["Laurenz Reichardt","Luca Uhr","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2408.14253v1.pdf","comment":"Accepted at the 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2408.14249v1","updated":"2024-08-26T13:09:23Z","published":"2024-08-26T13:09:23Z","title":"Beyond Few-shot Object Detection: A Detailed Survey","summary":" Object detection is a critical field in computer vision focusing on\naccurately identifying and locating specific objects in images or videos.\nTraditional methods for object detection rely on large labeled training\ndatasets for each object category, which can be time-consuming and expensive to\ncollect and annotate. To address this issue, researchers have introduced\nfew-shot object detection (FSOD) approaches that merge few-shot learning and\nobject detection principles. These approaches allow models to quickly adapt to\nnew object categories with only a few annotated samples. While traditional FSOD\nmethods have been studied before, this survey paper comprehensively reviews\nFSOD research with a specific focus on covering different FSOD settings such as\nstandard FSOD, generalized FSOD, incremental FSOD, open-set FSOD, and domain\nadaptive FSOD. These approaches play a vital role in reducing the reliance on\nextensive labeled datasets, particularly as the need for efficient machine\nlearning models continues to rise. This survey paper aims to provide a\ncomprehensive understanding of the above-mentioned few-shot settings and\nexplore the methodologies for each FSOD task. It thoroughly compares\nstate-of-the-art methods across different FSOD settings, analyzing them in\ndetail based on their evaluation protocols. Additionally, it offers insights\ninto their applications, challenges, and potential future directions in the\nevolving field of object detection with limited data.\n","authors":["Vishal Chudasama","Hiran Sarkar","Pankaj Wasnik","Vineeth N Balasubramanian","Jayateja Kalla"],"pdf_url":"https://arxiv.org/pdf/2408.14249v1.pdf","comment":"43 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.08282v3","updated":"2024-08-26T13:01:39Z","published":"2024-06-12T14:47:51Z","title":"Interpretable Representation Learning of Cardiac MRI via Attribute\n Regularization","summary":" Interpretability is essential in medical imaging to ensure that clinicians\ncan comprehend and trust artificial intelligence models. Several approaches\nhave been recently considered to encode attributes in the latent space to\nenhance its interpretability. 
Notably, attribute regularization aims to encode\na set of attributes along the dimensions of a latent representation. However,\nthis approach is based on Variational AutoEncoder and suffers from blurry\nreconstruction. In this paper, we propose an Attributed-regularized Soft\nIntrospective Variational Autoencoder that combines attribute regularization of\nthe latent space within the framework of an adversarially trained variational\nautoencoder. We demonstrate on short-axis cardiac Magnetic Resonance images of\nthe UK Biobank the ability of the proposed method to address blurry\nreconstruction issues of variational autoencoder methods while preserving the\nlatent space interpretability.\n","authors":["Maxime Di Folco","Cosmin I. Bercea","Emily Chan","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2406.08282v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.08915"},{"id":"http://arxiv.org/abs/2408.14244v1","updated":"2024-08-26T12:59:32Z","published":"2024-08-26T12:59:32Z","title":"Cascaded Temporal Updating Network for Efficient Video Super-Resolution","summary":" Existing video super-resolution (VSR) methods generally adopt a recurrent\npropagation network to extract spatio-temporal information from the entire\nvideo sequences, exhibiting impressive performance. However, the key components\nin recurrent-based VSR networks significantly impact model efficiency, e.g.,\nthe alignment module occupies a substantial portion of model parameters, while\nthe bidirectional propagation mechanism significantly amplifies the inference\ntime. Consequently, developing a compact and efficient VSR method that can be\ndeployed on resource-constrained devices, e.g., smartphones, remains\nchallenging. To this end, we propose a cascaded temporal updating network\n(CTUN) for efficient VSR. We first develop an implicit cascaded alignment\nmodule to explore spatio-temporal correspondences from adjacent frames.\nMoreover, we propose a unidirectional propagation updating network to\nefficiently explore long-range temporal information, which is crucial for\nhigh-quality video reconstruction. Specifically, we develop a simple yet\neffective hidden updater that can leverage future information to update hidden\nfeatures during forward propagation, significantly reducing inference time\nwhile maintaining performance. Finally, we formulate all of these components\ninto an end-to-end trainable VSR network. Extensive experimental results show\nthat our CTUN achieves a favorable trade-off between efficiency and performance\ncompared to existing methods. Notably, compared with BasicVSR, our method\nobtains better results while employing only about 30% of the parameters and\nrunning time. The source code and pre-trained models will be available at\nhttps://github.com/House-Leo/CTUN.\n","authors":["Hao Li","Jiangxin Dong","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2408.14244v1.pdf","comment":"Project website: https://github.com/House-Leo/CTUN"},{"id":"http://arxiv.org/abs/2403.12848v2","updated":"2024-08-26T12:55:44Z","published":"2024-03-19T15:54:48Z","title":"Planner3D: LLM-enhanced graph prior meets 3D indoor scene explicit\n regularization","summary":" Compositional 3D scene synthesis has diverse applications across a spectrum\nof industries such as robotics, films, and video games, as it closely mirrors\nthe complexity of real-world multi-object environments. Conventional works\ntypically employ shape retrieval based frameworks which naturally suffer from\nlimited shape diversity. 
Recent progress has been made in object shape\ngeneration with generative models such as diffusion models, which increases the\nshape fidelity. However, these approaches separately treat 3D shape generation\nand layout generation. The synthesized scenes are usually hampered by layout\ncollision, which suggests that the scene-level fidelity is still\nunder-explored. In this paper, we aim at generating realistic and reasonable 3D\nindoor scenes from a scene graph. To enrich the priors of the given scene graph\ninputs, a large language model is utilized to aggregate the global-wise features\nwith local node-wise and edge-wise features. With a unified graph encoder,\ngraph features are extracted to guide joint layout-shape generation. Additional\nregularization is introduced to explicitly constrain the produced 3D layouts.\nBenchmarked on the SG-FRONT dataset, our method achieves better 3D scene\nsynthesis, especially in terms of scene-level fidelity. The source code will be\nreleased after publication.\n","authors":["Yao Wei","Martin Renqiang Min","George Vosselman","Li Erran Li","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2403.12848v2.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.16712v2","updated":"2024-08-26T12:52:25Z","published":"2024-01-30T03:17:02Z","title":"LF Tracy: A Unified Single-Pipeline Approach for Salient Object\n Detection in Light Field Cameras","summary":" Leveraging rich information is crucial for dense prediction tasks. Light\nfield (LF) cameras are instrumental in this regard, as they allow data to be\nsampled from various perspectives. This capability provides valuable spatial,\ndepth, and angular information, enhancing scene-parsing tasks. However, we have\nidentified two overlooked issues for the LF salient object detection (SOD)\ntask. (1): Previous approaches predominantly employ a customized two-stream\ndesign to discover the spatial and depth features within light field images.\nThe network struggles to learn the implicit angular information between\ndifferent images due to a lack of intra-network data connectivity. (2): Little\nresearch has been directed towards the data augmentation strategy for LF SOD.\nResearch on inter-network data connectivity is scant. In this study, we propose\nan efficient paradigm (LF Tracy) to address those issues. This comprises a\nsingle-pipeline encoder paired with a highly efficient information aggregation\n(IA) module (around 8M parameters) to establish an intra-network connection.\nThen, a simple yet effective data augmentation strategy called MixLD is\ndesigned to bridge the inter-network connections. Owing to this innovative\nparadigm, our model surpasses the existing state-of-the-art method through\nextensive experiments. In particular, LF Tracy demonstrates a 23% improvement over\nprevious results on the latest large-scale PKU dataset. The source code is\npublicly available at: https://github.com/FeiBryantkit/LF-Tracy.\n","authors":["Fei Teng","Jiaming Zhang","Jiawei Liu","Kunyu Peng","Xina Cheng","Zhiyong Li","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2401.16712v2.pdf","comment":"Accepted to ICPR 2024. 
The source code is publicly available at:\n https://github.com/FeiBryantkit/LF-Tracy"},{"id":"http://arxiv.org/abs/2408.14229v1","updated":"2024-08-26T12:44:17Z","published":"2024-08-26T12:44:17Z","title":"Gallery-Aware Uncertainty Estimation For Open-Set Face Recognition","summary":" Accurately estimating image quality and model robustness improvement are\ncritical challenges in unconstrained face recognition, which can be addressed\nthrough uncertainty estimation via probabilistic face embeddings. Previous\nresearch mainly focused on uncertainty estimation in face verification, leaving\nthe open-set face recognition task underexplored. In open-set face recognition,\none seeks to classify an image, which could also be unknown. Here, the low\nvariance of probabilistic embedding does not imply a low error probability: an\nimage embedding could be close to several classes in a gallery, thus yielding\nhigh uncertainty. We propose a method aware of two sources of ambiguity in the\nopen-set recognition system: (1) the gallery uncertainty caused by overlapping\nclasses and (2) the uncertainty of the face embeddings. To detect both types,\nwe use a Bayesian probabilistic model of embedding distribution, which provides\na principled uncertainty estimate. Challenging open-set face recognition\ndatasets, such as IJB-C, serve as a testbed for our method. We also propose a\nnew open-set recognition protocol for whale and dolphin identification. The\nproposed approach better identifies recognition errors than uncertainty\nestimation methods based solely on image quality.\n","authors":["Leonid Erlygin","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2408.14229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14227v1","updated":"2024-08-26T12:43:48Z","published":"2024-08-26T12:43:48Z","title":"TC-PDM: Temporally Consistent Patch Diffusion Models for\n Infrared-to-Visible Video Translation","summary":" Infrared imaging offers resilience against changing lighting conditions by\ncapturing object temperatures. Yet, in few scenarios, its lack of visual\ndetails compared to daytime visible images, poses a significant challenge for\nhuman and machine interpretation. This paper proposes a novel diffusion method,\ndubbed Temporally Consistent Patch Diffusion Models (TC-DPM), for\ninfrared-to-visible video translation. Our method, extending the Patch\nDiffusion Model, consists of two key components. Firstly, we propose a\nsemantic-guided denoising, leveraging the strong representations of\nfoundational models. As such, our method faithfully preserves the semantic\nstructure of generated visible images. Secondly, we propose a novel temporal\nblending module to guide the denoising trajectory, ensuring the temporal\nconsistency between consecutive frames. Experiment shows that TC-PDM\noutperforms state-of-the-art methods by 35.3% in FVD for infrared-to-visible\nvideo translation and by 6.1% in AP50 for day-to-night object detection. 
Our\ncode is publicly available at https://github.com/dzungdoan6/tc-pdm\n","authors":["Anh-Dzung Doan","Vu Minh Hieu Phan","Surabhi Gupta","Markus Wagner","Tat-Jun Chin","Ian Reid"],"pdf_url":"https://arxiv.org/pdf/2408.14227v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2408.14211v1","updated":"2024-08-26T12:10:52Z","published":"2024-08-26T12:10:52Z","title":"MagicMan: Generative Novel View Synthesis of Humans with 3D-Aware\n Diffusion and Iterative Refinement","summary":" Existing works in single-image human reconstruction suffer from weak\ngeneralizability due to insufficient training data or 3D inconsistencies for a\nlack of comprehensive multi-view knowledge. In this paper, we introduce\nMagicMan, a human-specific multi-view diffusion model designed to generate\nhigh-quality novel view images from a single reference image. As its core, we\nleverage a pre-trained 2D diffusion model as the generative prior for\ngeneralizability, with the parametric SMPL-X model as the 3D body prior to\npromote 3D awareness. To tackle the critical challenge of maintaining\nconsistency while achieving dense multi-view generation for improved 3D human\nreconstruction, we first introduce hybrid multi-view attention to facilitate\nboth efficient and thorough information interchange across different views.\nAdditionally, we present a geometry-aware dual branch to perform concurrent\ngeneration in both RGB and normal domains, further enhancing consistency via\ngeometry cues. Last but not least, to address ill-shaped issues arising from\ninaccurate SMPL-X estimation that conflicts with the reference image, we\npropose a novel iterative refinement strategy, which progressively optimizes\nSMPL-X accuracy while enhancing the quality and consistency of the generated\nmulti-views. Extensive experimental results demonstrate that our method\nsignificantly outperforms existing approaches in both novel view synthesis and\nsubsequent 3D human reconstruction tasks.\n","authors":["Xu He","Xiaoyu Li","Di Kang","Jiangnan Ye","Chaopeng Zhang","Liyang Chen","Xiangjun Gao","Han Zhang","Zhiyong Wu","Haolin Zhuang"],"pdf_url":"https://arxiv.org/pdf/2408.14211v1.pdf","comment":"Project Page: https://thuhcsi.github.io/MagicMan"},{"id":"http://arxiv.org/abs/2408.14197v1","updated":"2024-08-26T11:53:09Z","published":"2024-08-26T11:53:09Z","title":"Driving in the Occupancy World: Vision-Centric 4D Occupancy Forecasting\n and Planning via World Models for Autonomous Driving","summary":" World models envision potential future states based on various ego actions.\nThey embed extensive knowledge about the driving environment, facilitating safe\nand scalable autonomous driving. Most existing methods primarily focus on\neither data generation or the pretraining paradigms of world models. Unlike the\naforementioned prior works, we propose Drive-OccWorld, which adapts a\nvision-centric 4D forecasting world model to end-to-end planning for autonomous\ndriving. Specifically, we first introduce a semantic and motion-conditional\nnormalization in the memory module, which accumulates semantic and dynamic\ninformation from historical BEV embeddings. These BEV features are then\nconveyed to the world decoder for future occupancy and flow forecasting,\nconsidering both geometry and spatiotemporal modeling. 
Additionally, we propose\ninjecting flexible action conditions, such as velocity, steering angle,\ntrajectory, and commands, into the world model to enable controllable\ngeneration and facilitate a broader range of downstream applications.\nFurthermore, we explore integrating the generative capabilities of the 4D world\nmodel with end-to-end planning, enabling continuous forecasting of future\nstates and the selection of optimal trajectories using an occupancy-based cost\nfunction. Extensive experiments on the nuScenes dataset demonstrate that our\nmethod can generate plausible and controllable 4D occupancy, opening new\navenues for driving world generation and end-to-end planning.\n","authors":["Yu Yang","Jianbiao Mei","Yukai Ma","Siliang Du","Wenqing Chen","Yijie Qian","Yuxiang Feng","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14197v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.14192v1","updated":"2024-08-26T11:36:38Z","published":"2024-08-26T11:36:38Z","title":"Feature Aligning Few shot Learning Method Using Local Descriptors\n Weighted Rules","summary":" Few-shot classification involves identifying new categories using a limited\nnumber of labeled samples. Current few-shot classification methods based on\nlocal descriptors primarily leverage underlying consistent features across\nvisible and invisible classes, facing challenges including redundant\nneighboring information, noisy representations, and limited interpretability.\nThis paper proposes a Feature Aligning Few-shot Learning Method Using Local\nDescriptors Weighted Rules (FAFD-LDWR). It innovatively introduces a\ncross-normalization method into few-shot image classification to preserve the\ndiscriminative information of local descriptors as much as possible; and\nenhances classification performance by aligning key local descriptors of\nsupport and query sets to remove background noise. FAFD-LDWR performs\nexcellently on three benchmark datasets , outperforming state-of-the-art\nmethods in both 1-shot and 5-shot settings. The designed visualization\nexperiments also demonstrate FAFD-LDWR's improvement in prediction\ninterpretability.\n","authors":["Bingchen Yan"],"pdf_url":"https://arxiv.org/pdf/2408.14192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14189v1","updated":"2024-08-26T11:26:27Z","published":"2024-08-26T11:26:27Z","title":"EMDFNet: Efficient Multi-scale and Diverse Feature Network for Traffic\n Sign Detection","summary":" The detection of small objects, particularly traffic signs, is a critical\nsubtask within object detection and autonomous driving. Despite the notable\nachievements in previous research, two primary challenges persist. Firstly, the\nmain issue is the singleness of feature extraction. Secondly, the detection\nprocess fails to effectively integrate with objects of varying sizes or scales.\nThese issues are also prevalent in generic object detection. Motivated by these\nchallenges, in this paper, we propose a novel object detection network named\nEfficient Multi-scale and Diverse Feature Network (EMDFNet) for traffic sign\ndetection that integrates an Augmented Shortcut Module and an Efficient Hybrid\nEncoder to address the aforementioned issues simultaneously. Specifically, the\nAugmented Shortcut Module utilizes multiple branches to integrate various\nspatial semantic information and channel semantic information, thereby\nenhancing feature diversity. 
The Efficient Hybrid Encoder utilizes global\nfeature fusion and local feature interaction based on various features to\ngenerate distinctive classification features by integrating feature information\nin an adaptable manner. Extensive experiments on the Tsinghua-Tencent 100K\n(TT100K) benchmark and the German Traffic Sign Detection Benchmark (GTSDB)\ndemonstrate that our EMDFNet outperforms other state-of-the-art detectors in\nperformance while retaining the real-time processing capabilities of\nsingle-stage models. This substantiates the effectiveness of EMDFNet in\ndetecting small traffic signs.\n","authors":["Pengyu Li","Chenhe Liu","Tengfei Li","Xinyu Wang","Shihui Zhang","Dongyang Yu"],"pdf_url":"https://arxiv.org/pdf/2408.14189v1.pdf","comment":"15 pages,5 figures,accepted to ICANN"},{"id":"http://arxiv.org/abs/2408.14187v1","updated":"2024-08-26T11:24:13Z","published":"2024-08-26T11:24:13Z","title":"Ensemble Predicate Decoding for Unbiased Scene Graph Generation","summary":" Scene Graph Generation (SGG) aims to generate a comprehensive graphical\nrepresentation that accurately captures the semantic information of a given\nscenario. However, the SGG model's performance in predicting more fine-grained\npredicates is hindered by a significant predicate bias. According to existing\nworks, the long-tail distribution of predicates in training data results in the\nbiased scene graph. However, the semantic overlap between predicate categories\nmakes predicate prediction difficult, and there is a significant difference in\nthe sample size of semantically similar predicates, making the predicate\nprediction more difficult. Therefore, higher requirements are placed on the\ndiscriminative ability of the model. In order to address this problem, this\npaper proposes Ensemble Predicate Decoding (EPD), which employs multiple\ndecoders to attain unbiased scene graph generation. Two auxiliary decoders\ntrained on lower-frequency predicates are used to improve the discriminative\nability of the model. Extensive experiments are conducted on the VG, and the\nexperiment results show that EPD enhances the model's representation capability\nfor predicates. In addition, we find that our approach ensures a relatively\nsuperior predictive capability for more frequent predicates compared to\nprevious unbiased SGG methods.\n","authors":["Jiasong Feng","Lichun Wang","Hongbo Xu","Kai Xu","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2408.14187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14186v1","updated":"2024-08-26T11:22:52Z","published":"2024-08-26T11:22:52Z","title":"Affine steerers for structured keypoint description","summary":" We propose a way to train deep learning based keypoint descriptors that makes\nthem approximately equivariant for locally affine transformations of the image\nplane. The main idea is to use the representation theory of GL(2) to generalize\nthe recently introduced concept of steerers from rotations to affine\ntransformations. Affine steerers give high control over how keypoint\ndescriptions transform under image transformations. We demonstrate the\npotential of using this control for image matching. Finally, we propose a way\nto finetune keypoint descriptors with a set of steerers on upright images and\nobtain state-of-the-art results on several standard benchmarks. 
Code will be\npublished at github.com/georg-bn/affine-steerers.\n","authors":["Georg Bökman","Johan Edstedt","Michael Felsberg","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2408.14186v1.pdf","comment":"To be presented at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.14180v1","updated":"2024-08-26T11:08:44Z","published":"2024-08-26T11:08:44Z","title":"I2EBench: A Comprehensive Benchmark for Instruction-based Image Editing","summary":" Significant progress has been made in the field of Instruction-based Image\nEditing (IIE). However, evaluating these models poses a significant challenge.\nA crucial requirement in this field is the establishment of a comprehensive\nevaluation benchmark for accurately assessing editing results and providing\nvaluable insights for its further development. In response to this need, we\npropose I2EBench, a comprehensive benchmark designed to automatically evaluate\nthe quality of edited images produced by IIE models from multiple dimensions.\nI2EBench consists of 2,000+ images for editing, along with 4,000+ corresponding\noriginal and diverse instructions. It offers three distinctive characteristics:\n1) Comprehensive Evaluation Dimensions: I2EBench comprises 16 evaluation\ndimensions that cover both high-level and low-level aspects, providing a\ncomprehensive assessment of each IIE model. 2) Human Perception Alignment: To\nensure the alignment of our benchmark with human perception, we conducted an\nextensive user study for each evaluation dimension. 3) Valuable Research\nInsights: By analyzing the advantages and disadvantages of existing IIE models\nacross the 16 dimensions, we offer valuable research insights to guide future\ndevelopment in the field. We will open-source I2EBench, including all\ninstructions, input images, human annotations, edited images from all evaluated\nmethods, and a simple script for evaluating the results from new IIE models.\nThe code, dataset and generated images from all IIE models are provided on\nGitHub: https://github.com/cocoshe/I2EBench.\n","authors":["Yiwei Ma","Jiayi Ji","Ke Ye","Weihuang Lin","Zhibin Wang","Yonghan Zheng","Qiang Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.14180v1.pdf","comment":"Tech report, 39 pages, 41 figures"},{"id":"http://arxiv.org/abs/2408.13149v2","updated":"2024-08-26T11:05:58Z","published":"2024-08-23T15:16:01Z","title":"Focus on Neighbors and Know the Whole: Towards Consistent Dense\n Multiview Text-to-Image Generator for 3D Creation","summary":" Generating dense multiview images from text prompts is crucial for creating\nhigh-fidelity 3D assets. Nevertheless, existing methods struggle with\nspace-view correspondences, resulting in sparse and low-quality outputs. In\nthis paper, we introduce CoSER, a novel consistent dense Multiview\nText-to-Image Generator for Text-to-3D, achieving both efficiency and quality\nby meticulously learning neighbor-view coherence and further alleviating\nambiguity through the swift traversal of all views. For achieving neighbor-view\nconsistency, each viewpoint densely interacts with adjacent viewpoints to\nperceive the global spatial structure, and aggregates information along motion\npaths explicitly defined by physical principles to refine details. To further\nenhance cross-view consistency and alleviate content drift, CoSER rapidly scans\nall views in a spiral bidirectional manner to capture holistic information and then\nscores each point based on semantic material.
Subsequently, we conduct weighted\ndown-sampling along the spatial dimension based on scores, thereby facilitating\nprominent information fusion across all views with lightweight computation.\nTechnically, the core module is built by integrating the attention mechanism\nwith a selective state space model, exploiting the robust learning capabilities\nof the former and the low overhead of the latter. Extensive evaluation shows\nthat CoSER is capable of producing dense, high-fidelity, content-consistent\nmultiview images that can be flexibly integrated into various 3D generation\nmodels.\n","authors":["Bonan Li","Zicheng Zhang","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.13149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02474v3","updated":"2024-08-26T11:04:45Z","published":"2024-02-04T13:09:13Z","title":"Deep Spectral Improvement for Unsupervised Image Instance Segmentation","summary":" Deep spectral methods reframe the image decomposition process as a graph\npartitioning task by extracting features using self-supervised learning and\nutilizing the Laplacian of the affinity matrix to obtain eigensegments.\nHowever, instance segmentation has received less attention compared to other\ntasks within the context of deep spectral methods. This paper addresses the\nfact that not all channels of the feature map extracted from a self-supervised\nbackbone contain sufficient information for instance segmentation purposes. In\nfact, some channels are noisy and hinder the accuracy of the task. To overcome\nthis issue, this paper proposes two channel reduction modules: Noise Channel\nReduction (NCR) and Deviation-based Channel Reduction (DCR). The NCR retains\nchannels with lower entropy, as they are less likely to be noisy, while DCR\nprunes channels with low standard deviation, as they lack sufficient\ninformation for effective instance segmentation. Furthermore, the paper\ndemonstrates that the dot product, commonly used in deep spectral methods, is\nnot suitable for instance segmentation due to its sensitivity to feature map\nvalues, potentially leading to incorrect instance segments. A new similarity\nmetric called Bray-Curtis over Chebyshev (BoC) is proposed to address this\nissue. It takes into account the distribution of features in addition to their\nvalues, providing a more robust similarity measure for instance segmentation.\nQuantitative and qualitative results on the Youtube-VIS2019 dataset highlight\nthe improvements achieved by the proposed channel reduction methods and the use\nof BoC instead of the conventional dot product for creating the affinity\nmatrix. These improvements are observed in terms of mean Intersection over\nUnion and extracted instance segments, demonstrating enhanced instance\nsegmentation performance. The code is available at:\nhttps://github.com/farnooshar/SpecUnIIS\n","authors":["Farnoosh Arefi","Amir M. Mansourian","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2402.02474v3.pdf","comment":"11 pages, 13 figures and 5 tables"},{"id":"http://arxiv.org/abs/2408.04249v2","updated":"2024-08-26T10:57:15Z","published":"2024-08-08T06:29:32Z","title":"InstantStyleGaussian: Efficient Art Style Transfer with 3D Gaussian\n Splatting","summary":" We present InstantStyleGaussian, an innovative 3D style transfer method based\non the 3D Gaussian Splatting (3DGS) scene representation. By inputting a\ntarget-style image, it quickly generates new 3D GS scenes.
Our method operates\non pre-reconstructed GS scenes, combining diffusion models with an improved\niterative dataset update strategy. It utilizes diffusion models to generate\ntarget style images, adds these new images to the training dataset, and uses\nthis dataset to iteratively update and optimize the GS scenes, significantly\naccelerating the style editing process while ensuring the quality of the\ngenerated scenes. Extensive experimental results demonstrate that our method\nensures high-quality stylized scenes while offering significant advantages in\nstyle transfer speed and consistency.\n","authors":["Xin-Yi Yu","Jun-Xin Yu","Li-Bo Zhou","Yan Wei","Lin-Lin Ou"],"pdf_url":"https://arxiv.org/pdf/2408.04249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14177v1","updated":"2024-08-26T10:50:14Z","published":"2024-08-26T10:50:14Z","title":"NimbleD: Enhancing Self-supervised Monocular Depth Estimation with\n Pseudo-labels and Large-scale Video Pre-training","summary":" We introduce NimbleD, an efficient self-supervised monocular depth estimation\nlearning framework that incorporates supervision from pseudo-labels generated\nby a large vision model. This framework does not require camera intrinsics,\nenabling large-scale pre-training on publicly available videos. Our\nstraightforward yet effective learning strategy significantly enhances the\nperformance of fast and lightweight models without introducing any overhead,\nallowing them to achieve performance comparable to state-of-the-art\nself-supervised monocular depth estimation models. This advancement is\nparticularly beneficial for virtual and augmented reality applications\nrequiring low latency inference. The source code, model weights, and\nacknowledgments are available at https://github.com/xapaxca/nimbled .\n","authors":["Albert Luginov","Muhammad Shahzad"],"pdf_url":"https://arxiv.org/pdf/2408.14177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17428v2","updated":"2024-08-26T10:49:29Z","published":"2023-11-29T08:09:01Z","title":"SigFormer: Sparse Signal-Guided Transformer for Multi-Modal Human Action\n Segmentation","summary":" Multi-modal human action segmentation is a critical and challenging task with\na wide range of applications. Nowadays, the majority of approaches concentrate\non the fusion of dense signals (i.e., RGB, optical flow, and depth maps).\nHowever, the potential contributions of sparse IoT sensor signals, which can be\ncrucial for achieving accurate recognition, have not been fully explored. To\nmake up for this, we introduce a Sparse signal-guided Transformer (SigFormer) to\ncombine both dense and sparse signals. We employ mask attention to fuse\nlocalized features by constraining cross-attention within the regions where\nsparse signals are valid. However, since sparse signals are discrete, they lack\nsufficient information about the temporal action boundaries. Therefore, in\nSigFormer, we propose to emphasize the boundary information at two stages to\nalleviate this problem. In the first feature extraction stage, we introduce an\nintermediate bottleneck module to jointly learn both category and boundary\nfeatures of each dense modality through the inner loss functions. After the\nfusion of dense modalities and sparse signals, we then devise a two-branch\narchitecture that explicitly models the interrelationship between action\ncategory and temporal boundary.
Experimental results demonstrate that SigFormer\noutperforms the state-of-the-art approaches on a multi-modal action\nsegmentation dataset from real industrial environments, reaching an outstanding\nF1 score of 0.958. The codes and pre-trained models have been available at\nhttps://github.com/LIUQI-creat/SigFormer.\n","authors":["Qi Liu","Xinchen Liu","Kun Liu","Xiaoyan Gu","Wu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17428v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14176v1","updated":"2024-08-26T10:42:53Z","published":"2024-08-26T10:42:53Z","title":"SwiftBrush v2: Make Your One-step Diffusion Model Better Than Its\n Teacher","summary":" In this paper, we aim to enhance the performance of SwiftBrush, a prominent\none-step text-to-image diffusion model, to be competitive with its multi-step\nStable Diffusion counterpart. Initially, we explore the quality-diversity\ntrade-off between SwiftBrush and SD Turbo: the former excels in image\ndiversity, while the latter excels in image quality. This observation motivates\nour proposed modifications in the training methodology, including better weight\ninitialization and efficient LoRA training. Moreover, our introduction of a\nnovel clamped CLIP loss enhances image-text alignment and results in improved\nimage quality. Remarkably, by combining the weights of models trained with\nefficient LoRA and full training, we achieve a new state-of-the-art one-step\ndiffusion model, achieving an FID of 8.14 and surpassing all GAN-based and\nmulti-step Stable Diffusion models. The evaluation code is available at:\nhttps://github.com/vinairesearch/swiftbrushv2.\n","authors":["Trung Dao","Thuan Hoang Nguyen","Thanh Le","Duc Vu","Khoi Nguyen","Cuong Pham","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2408.14176v1.pdf","comment":"Accepted to ECCV'24"},{"id":"http://arxiv.org/abs/2402.11237v2","updated":"2024-08-26T10:39:22Z","published":"2024-02-17T10:02:22Z","title":"Be Persistent: Towards a Unified Solution for Mitigating Shortcuts in\n Deep Learning","summary":" Deep neural networks (DNNs) are vulnerable to shortcut learning: rather than\nlearning the intended task, they tend to draw inconclusive relationships\nbetween their inputs and outputs. Shortcut learning is ubiquitous among many\nfailure cases of neural networks, and traces of this phenomenon can be seen in\ntheir generalizability issues, domain shift, adversarial vulnerability, and\neven bias towards majority groups. In this paper, we argue that this\ncommonality in the cause of various DNN issues creates a significant\nopportunity that should be leveraged to find a unified solution for shortcut\nlearning. To this end, we outline the recent advances in topological data\nanalysis (TDA), and persistent homology (PH) in particular, to sketch a unified\nroadmap for detecting shortcuts in deep learning. We demonstrate our arguments\nby investigating the topological features of computational graphs in DNNs using\ntwo cases of unlearnable examples and bias in decision-making as our test\nstudies. Our analysis of these two failure cases of DNNs reveals that finding a\nunified solution for shortcut learning in DNNs is not out of reach, and TDA can\nplay a significant role in forming such a framework.\n","authors":["Hadi M. Dolatabadi","Sarah M. 
Erfani","Christopher Leckie"],"pdf_url":"https://arxiv.org/pdf/2402.11237v2.pdf","comment":"Accepted to the 2024 European Conference on Artificial Intelligence\n (ECAI)"},{"id":"http://arxiv.org/abs/2408.14173v1","updated":"2024-08-26T10:39:01Z","published":"2024-08-26T10:39:01Z","title":"BackFlip: The Impact of Local and Global Data Augmentations on Artistic\n Image Aesthetic Assessment","summary":" Assessing the aesthetic quality of artistic images presents unique challenges\ndue to the subjective nature of aesthetics and the complex visual\ncharacteristics inherent to artworks. Basic data augmentation techniques\ncommonly applied to natural images in computer vision may not be suitable for\nart images in aesthetic evaluation tasks, as they can change the composition of\nthe art images. In this paper, we explore the impact of local and global data\naugmentation techniques on artistic image aesthetic assessment (IAA). We\nintroduce BackFlip, a local data augmentation technique designed specifically\nfor artistic IAA. We evaluate the performance of BackFlip across three artistic\nimage datasets and four neural network architectures, comparing it with the\ncommonly used data augmentation techniques. Then, we analyze the effects of\ncomponents within the BackFlip pipeline through an ablation study. Our findings\ndemonstrate that local augmentations, such as BackFlip, tend to outperform\nglobal augmentations on artistic IAA in most cases, probably because they do\nnot perturb the composition of the art images. These results emphasize the\nimportance of considering both local and global augmentations in future\ncomputational aesthetics research.\n","authors":["Ombretta Strafforello","Gonzalo Muradas Odriozola","Fatemeh Behrad","Li-Wei Chen","Anne-Sofie Maerten","Derya Soydaner","Johan Wagemans"],"pdf_url":"https://arxiv.org/pdf/2408.14173v1.pdf","comment":"Published at the VISART VII workshop at ECCV 2024. Ombretta\n Strafforello, Gonzalo Muradas Odriozola, Fatemeh Behrad, Li-Wei Chen,\n Anne-Sofie Maerten and Derya Soydaner contributed equally to this work"},{"id":"http://arxiv.org/abs/2408.01224v3","updated":"2024-08-26T09:59:55Z","published":"2024-08-02T12:27:15Z","title":"Multi-head Spatial-Spectral Mamba for Hyperspectral Image Classification","summary":" Spatial-Spectral Mamba (SSM) improves computational efficiency and captures\nlong-range dependencies, addressing Transformer limitations. However,\ntraditional Mamba models overlook rich spectral information in HSIs and\nstruggle with high dimensionality and sequential data. To address these issues,\nwe propose the SSM with multi-head self-attention and token enhancement\n(MHSSMamba). This model integrates spectral and spatial information by\nenhancing spectral tokens and using multi-head attention to capture complex\nrelationships between spectral bands and spatial locations. It also manages\nlong-range dependencies and the sequential nature of HSI data, preserving\ncontextual information across spectral bands. MHSSMamba achieved remarkable\nclassification accuracies of 97.62\\% on Pavia University, 96.92\\% on the\nUniversity of Houston, 96.85\\% on Salinas, and 99.49\\% on Wuhan-longKou\ndatasets. 
The source code is available at\n\\href{https://github.com/MHassaanButt/MHA\\_SS\\_Mamba}{GitHub}.\n","authors":["Muhammad Ahmad","Muhammad Hassaan Farooq Butt","Muhammad Usama","Hamad Ahmed Altuwaijri","Manuel Mazzara","Salvatore Distefano"],"pdf_url":"https://arxiv.org/pdf/2408.01224v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14153v1","updated":"2024-08-26T09:55:34Z","published":"2024-08-26T09:55:34Z","title":"Explaining Vision-Language Similarities in Dual Encoders with\n Feature-Pair Attributions","summary":" Dual encoder architectures like CLIP models map two types of inputs into a\nshared embedding space and learn similarities between them. However, it is not\nunderstood how such models compare two inputs. Here, we address this research\ngap with two contributions. First, we derive a method to attribute predictions\nof any differentiable dual encoder onto feature-pair interactions between its\ninputs. Second, we apply our method to CLIP-type models and show that they\nlearn fine-grained correspondences between parts of captions and regions in\nimages. They match objects across input modes and also account for mismatches.\nHowever, this visual-linguistic grounding ability heavily varies between object\nclasses, depends on the training data distribution, and largely improves after\nin-domain training. Using our method we can identify knowledge gaps about\nspecific object classes in individual models and can monitor their improvement\nupon fine-tuning.\n","authors":["Lucas Möller","Pascal Tilli","Ngoc Thang Vu","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2408.14153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14152v1","updated":"2024-08-26T09:55:32Z","published":"2024-08-26T09:55:32Z","title":"Application of Disentanglement to Map Registration Problem","summary":" Geospatial data come from various sources, such as satellites, aircraft, and\nLiDAR. The variability of the source is not limited to the types of data\nacquisition techniques, as we have maps from different time periods. To\nincorporate these data for a coherent analysis, it is essential to first align\ndifferent \"styles\" of geospatial data to its matching images that point to the\nsame location on the surface of the Earth. In this paper, we approach the image\nregistration as a two-step process of (1) extracting geospatial contents\ninvariant to visual (and any other non-content-related) information, and (2)\nmatching the data based on such (purely) geospatial contents. We hypothesize\nthat a combination of $\\beta$-VAE-like architecture [2] and adversarial\ntraining will achieve both the disentanglement of the geographic information\nand artistic styles and generation of new map tiles by composing the encoded\ngeographic information with any artistic style.\n","authors":["Hae Jin Song","Patrycja Krawczuk","Po-Hsuan Huang"],"pdf_url":"https://arxiv.org/pdf/2408.14152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14143v1","updated":"2024-08-26T09:41:40Z","published":"2024-08-26T09:41:40Z","title":"2D-Malafide: Adversarial Attacks Against Face Deepfake Detection Systems","summary":" We introduce 2D-Malafide, a novel and lightweight adversarial attack designed\nto deceive face deepfake detection systems. Building upon the concept of 1D\nconvolutional perturbations explored in the speech domain, our method leverages\n2D convolutional filters to craft perturbations which significantly degrade the\nperformance of state-of-the-art face deepfake detectors. 
Unlike traditional\nadditive noise approaches, 2D-Malafide optimises a small number of filter\ncoefficients to generate robust adversarial perturbations which are\ntransferable across different face images. Experiments, conducted using the\nFaceForensics++ dataset, demonstrate that 2D-Malafide substantially degrades\ndetection performance in both white-box and black-box settings, with larger\nfilter sizes having the greatest impact. Additionally, we report an\nexplainability analysis using GradCAM which illustrates how 2D-Malafide\nmisleads detection systems by altering the image areas used most for\nclassification. Our findings highlight the vulnerability of current deepfake\ndetection systems to convolutional adversarial attacks as well as the need for\nfuture work to enhance detection robustness through improved image fidelity\nconstraints.\n","authors":["Chiara Galdi","Michele Panariello","Massimiliano Todisco","Nicholas Evans"],"pdf_url":"https://arxiv.org/pdf/2408.14143v1.pdf","comment":"Accepted at BIOSIG 2024"},{"id":"http://arxiv.org/abs/2408.14135v1","updated":"2024-08-26T09:32:16Z","published":"2024-08-26T09:32:16Z","title":"Foodfusion: A Novel Approach for Food Image Composition via Diffusion\n Models","summary":" Food image composition requires the use of existing dish images and\nbackground images to synthesize a natural new image, while diffusion models\nhave made significant advancements in image generation, enabling the\nconstruction of end-to-end architectures that yield promising results. However,\nexisting diffusion models face challenges in processing and fusing information\nfrom multiple images and lack access to high-quality publicly available\ndatasets, which prevents the application of diffusion models in food image\ncomposition. In this paper, we introduce a large-scale, high-quality food image\ncomposite dataset, FC22k, which comprises 22,000 foreground, background, and\nground truth ternary image pairs. Additionally, we propose a novel food image\ncomposition method, Foodfusion, which leverages the capabilities of the\npre-trained diffusion models and incorporates a Fusion Module for processing\nand integrating foreground and background information. This fused information\naligns the foreground features with the background structure by merging the\nglobal structural information at the cross-attention layer of the denoising\nUNet. To further enhance the content and structure of the background, we also\nintegrate a Content-Structure Control Module. Extensive experiments demonstrate\nthe effectiveness and scalability of our proposed method.\n","authors":["Chaohua Shi","Xuan Wang","Si Shi","Xule Wang","Mingrui Zhu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2408.14135v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.14131v1","updated":"2024-08-26T09:26:08Z","published":"2024-08-26T09:26:08Z","title":"GenFormer -- Generated Images are All You Need to Improve Robustness of\n Transformers on Small Datasets","summary":" Recent studies showcase the competitive accuracy of Vision Transformers\n(ViTs) in relation to Convolutional Neural Networks (CNNs), along with their\nremarkable robustness. However, ViTs demand a large amount of data to achieve\nadequate performance, which makes their application to small datasets\nchallenging, falling behind CNNs. 
To overcome this, we propose GenFormer, a\ndata augmentation strategy utilizing generated images, thereby improving\ntransformer accuracy and robustness on small-scale image classification tasks.\nIn our comprehensive evaluation we propose Tiny ImageNetV2, -R, and -A as new\ntest set variants of Tiny ImageNet by transferring established ImageNet\ngeneralization and robustness benchmarks to the small-scale data domain.\nSimilarly, we introduce MedMNIST-C and EuroSAT-C as corrupted test set variants\nof established fine-grained datasets in the medical and aerial domain. Through\na series of experiments conducted on small datasets of various domains,\nincluding Tiny ImageNet, CIFAR, EuroSAT and MedMNIST datasets, we demonstrate\nthe synergistic power of our method, in particular when combined with common\ntrain and test time augmentations, knowledge distillation, and architectural\ndesign choices. Additionally, we prove the effectiveness of our approach under\nchallenging conditions with limited training data, demonstrating significant\nimprovements in both accuracy and robustness, bridging the gap between CNNs and\nViTs in the small-scale dataset domain.\n","authors":["Sven Oehri","Nikolas Ebert","Ahmed Abdullah","Didier Stricker","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2408.14131v1.pdf","comment":"This paper has been accepted at International Conference on Pattern\n Recognition (ICPR), 2023"},{"id":"http://arxiv.org/abs/2406.02978v2","updated":"2024-08-26T09:23:44Z","published":"2024-06-05T06:21:54Z","title":"Self-Supervised Skeleton-Based Action Representation Learning: A\n Benchmark and Beyond","summary":" Self-supervised learning (SSL), which aims to learn meaningful prior\nrepresentations from unlabeled data, has been proven effective for\nskeleton-based action understanding. Different from the image domain, skeleton\ndata possesses sparser spatial structures and diverse representation forms,\nwith the absence of background clues and the additional temporal dimension,\npresenting new challenges for spatial-temporal motion pretext task design.\nRecently, many endeavors have been made for skeleton-based SSL, achieving\nremarkable progress. However, a systematic and thorough review is still\nlacking. In this paper, we conduct, for the first time, a comprehensive survey\non self-supervised skeleton-based action representation learning. Following the\ntaxonomy of context-based, generative learning, and contrastive learning\napproaches, we make a thorough review and benchmark of existing works and shed\nlight on the future possible directions. Remarkably, our investigation\ndemonstrates that most SSL works rely on the single paradigm, learning\nrepresentations of a single level, and are evaluated on the action recognition\ntask solely, which leaves the generalization power of skeleton SSL models\nunder-explored. To this end, a novel and effective SSL method for skeleton is\nfurther proposed, which integrates versatile representation learning objectives\nof different granularity, substantially boosting the generalization capacity\nfor multiple skeleton downstream tasks. 
Extensive experiments under three\nlarge-scale datasets demonstrate our method achieves superior generalization\nperformance on various downstream tasks, including recognition, retrieval,\ndetection, and few-shot learning.\n","authors":["Jiahang Zhang","Lilang Lin","Shuai Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02978v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15318v2","updated":"2024-08-26T09:21:35Z","published":"2024-04-03T13:33:07Z","title":"VASARI-auto: equitable, efficient, and economical featurisation of\n glioma MRI","summary":" The VASARI MRI feature set is a quantitative system designed to standardise\nglioma imaging descriptions. Though effective, deriving VASARI is\ntime-consuming and seldom used in clinical practice. This is a problem that\nmachine learning could plausibly automate. Using glioma data from 1172\npatients, we developed VASARI-auto, an automated labelling software applied to\nboth open-source lesion masks and our openly available tumour segmentation\nmodel. In parallel, two consultant neuroradiologists independently quantified\nVASARI features in a subsample of 100 glioblastoma cases. We quantified: 1)\nagreement across neuroradiologists and VASARI-auto; 2) calibration of\nperformance equity; 3) an economic workforce analysis; and 4) fidelity in\npredicting patient survival. Tumour segmentation was compatible with the\ncurrent state of the art and equally performant regardless of age or sex. A\nmodest inter-rater variability between in-house neuroradiologists was\ncomparable to between neuroradiologists and VASARI-auto, with far higher\nagreement between VASARI-auto methods. The time taken for neuroradiologists to\nderive VASARI was substantially higher than VASARI-auto (mean time per case 317\nvs. 3 seconds). A UK hospital workforce analysis forecast that three years of\nVASARI featurisation would demand 29,777 consultant neuroradiologist workforce\nhours ({\\pounds}1,574,935), reducible to 332 hours of computing time (and\n{\\pounds}146 of power) with VASARI-auto. The best-performing survival model\nutilised VASARI-auto features as opposed to those derived by neuroradiologists.\nVASARI-auto is a highly efficient automated labelling system with equitable\nperformance across patient age or sex, a favourable economic profile if used as\na decision support tool, and with non-inferior fidelity in downstream patient\nsurvival prediction. Future work should iterate upon and integrate such tools\nto enhance patient care.\n","authors":["James K Ruffle","Samia Mohinta","Kelly Pegoretti Baruteau","Rebekah Rajiah","Faith Lee","Sebastian Brandner","Parashkev Nachev","Harpreet Hyare"],"pdf_url":"https://arxiv.org/pdf/2404.15318v2.pdf","comment":"36 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2401.07729v2","updated":"2024-08-26T09:16:57Z","published":"2024-01-15T14:43:40Z","title":"SSL-Interactions: Pretext Tasks for Interactive Trajectory Prediction","summary":" This paper addresses motion forecasting in multi-agent environments, pivotal\nfor ensuring safety of autonomous vehicles. Traditional as well as recent\ndata-driven marginal trajectory prediction methods struggle to properly learn\nnon-linear agent-to-agent interactions. We present SSL-Interactions that\nproposes pretext tasks to enhance interaction modeling for trajectory\nprediction. 
We introduce four interaction-aware pretext tasks to encapsulate\nvarious aspects of agent interactions: range gap prediction, closest distance\nprediction, direction of movement prediction, and type of interaction\nprediction. We further propose an approach to curate interaction-heavy\nscenarios from datasets. This curated data has two advantages: it provides a\nstronger learning signal to the interaction model, and facilitates generation\nof pseudo-labels for interaction-centric pretext tasks. We also propose three\nnew metrics specifically designed to evaluate predictions in interactive\nscenes. Our empirical evaluations indicate SSL-Interactions outperforms\nstate-of-the-art motion forecasting methods quantitatively with up to 8%\nimprovement, and qualitatively, for interaction-heavy scenarios.\n","authors":["Prarthana Bhattacharyya","Chengjie Huang","Krzysztof Czarnecki"],"pdf_url":"https://arxiv.org/pdf/2401.07729v2.pdf","comment":"Accepted at IV-2024. 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.05206v4","updated":"2024-08-26T09:15:11Z","published":"2024-07-06T23:16:41Z","title":"Helios: An extremely low power event-based gesture recognition for\n always-on smart eyewear","summary":" This paper introduces Helios, the first extremely low-power, real-time,\nevent-based hand gesture recognition system designed for all-day use on smart\neyewear. As augmented reality (AR) evolves, current smart glasses like the Meta\nRay-Bans prioritize visual and wearable comfort at the expense of\nfunctionality. Existing human-machine interfaces (HMIs) in these devices, such\nas capacitive touch and voice controls, present limitations in ergonomics,\nprivacy and power consumption. Helios addresses these challenges by leveraging\nnatural hand interactions for a more intuitive and comfortable user experience.\nOur system utilizes an extremely low-power and compact 3mmx4mm/20mW event camera\nto perform natural hand-based gesture recognition for always-on smart eyewear.\nThe camera's output is processed by a convolutional neural network (CNN)\nrunning on an NXP Nano UltraLite compute platform, consuming less than 350mW.\nHelios can recognize seven classes of gestures, including subtle microgestures\nlike swipes and pinches, with 91% accuracy. We also demonstrate real-time\nperformance across 20 users at a remarkably low latency of 60ms. Our user\ntesting results align with the positive feedback we received during our recent\nsuccessful demo at AWE-USA-2024.\n","authors":["Prarthana Bhattacharyya","Joshua Mitton","Ryan Page","Owen Morgan","Ben Menzies","Gabriel Homewood","Kemi Jacobs","Paolo Baesso","David Trickett","Chris Mair","Taru Muhonen","Rory Clark","Louis Berridge","Richard Vigars","Iain Wallace"],"pdf_url":"https://arxiv.org/pdf/2407.05206v4.pdf","comment":"Accepted at ECCV-Integrating Computer Vision in Smart Eyewear, 2024.\n 18 pages, 10 figures. First three authors contributed equally to this paper"},{"id":"http://arxiv.org/abs/2211.07546v2","updated":"2024-08-26T09:01:23Z","published":"2022-11-14T17:11:15Z","title":"Vision meets algae: A novel way for microalgae recognization and health\n monitor","summary":" Marine microalgae are widespread in the ocean and play a crucial role in the\necosystem. Automatic identification and location of marine microalgae in\nmicroscopy images would help establish marine ecological environment monitoring\nand water quality evaluation systems.
We proposed a new dataset for the\ndetection of marine microalgae and a range of detection methods; the dataset\nincludes images of different genera of algae and the same genus in different\nstates. We set the number of unbalanced classes in the dataset and added\nimages of mixed water samples in the test set to simulate the actual situation\nin the field. Then we trained, validated and tested TOOD, YOLOv5, YOLOv8\nand variants of RCNN algorithms on this dataset. The results showed that both\none-stage and two-stage object detection models can achieve high mean average\nprecision, which proves the ability of computer vision in multi-object\ndetection of microalgae, and provides basic data and models for real-time\ndetection of microalgal cells.\n","authors":["Shizheng Zhou","Juntao Jiang","Xiaohan Hong","Yan Hong","Pengcheng Fu"],"pdf_url":"https://arxiv.org/pdf/2211.07546v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14114v1","updated":"2024-08-26T08:59:22Z","published":"2024-08-26T08:59:22Z","title":"ShapeMamba-EM: Fine-Tuning Foundation Model with Local Shape Descriptors\n and Mamba Blocks for 3D EM Image Segmentation","summary":" Electron microscopy (EM) imaging offers unparalleled resolution for analyzing\nneural tissues, crucial for uncovering the intricacies of synaptic connections\nand neural processes fundamental to understanding behavioral mechanisms.\nRecently, foundation models have demonstrated impressive performance across\nnumerous natural and medical image segmentation tasks. However, applying these\nfoundation models to EM segmentation faces significant challenges due to domain\ndisparities. This paper presents ShapeMamba-EM, a specialized fine-tuning\nmethod for 3D EM segmentation, which employs adapters for long-range dependency\nmodeling and an encoder for local shape description within the original\nfoundation model. This approach effectively addresses the unique volumetric and\nmorphological complexities of EM data. Tested over a wide range of EM images,\ncovering five segmentation tasks and 10 datasets, ShapeMamba-EM outperforms\nexisting methods, establishing a new standard in EM image segmentation and\nenhancing the understanding of neural tissue architecture.\n","authors":["Ruohua Shi","Qiufan Pang","Lei Ma","Lingyu Duan","Tiejun Huang","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.14114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14111v1","updated":"2024-08-26T08:55:16Z","published":"2024-08-26T08:55:16Z","title":"Bengali Sign Language Recognition through Hand Pose Estimation using\n Multi-Branch Spatial-Temporal Attention Model","summary":" Hand gesture-based sign language recognition (SLR) is one of the most\nadvanced applications of machine learning and computer vision using hand\ngestures. Although, in the past few years, many researchers have widely\nexplored and studied how to address BSL problems, specific unaddressed issues\nremain, such as skeleton and transformer-based BSL recognition. In addition,\nthe lack of evaluation of BSL models under various concealed environmental\nconditions leaves the generalisation of existing models to daily-life signs\nunproven. As a consequence, existing BSL recognition systems provide a\nlimited perspective of their generalisation ability as they are tested on\ndatasets containing few BSL alphabets that have a wide disparity in gestures\nand are easy to differentiate.
To overcome these limitations, we propose a\nspatial-temporal attention-based BSL recognition model considering hand joint\nskeletons extracted from the sequence of images. The main aim of utilising hand\nskeleton-based BSL data is to ensure the privacy and low-resolution sequence of\nimages, which need minimum computational cost and low hardware configurations.\nOur model captures discriminative structural displacements and short-range\ndependency based on unified joint features projected onto high-dimensional\nfeature space. Specifically, the use of Separable TCN combined with a powerful\nmulti-head spatial-temporal attention architecture generated high-performance\naccuracy. The extensive experiments with a proposed dataset and two benchmark\nBSL datasets with a wide range of evaluations, such as intra- and inter-dataset\nevaluation settings, demonstrated that our proposed models achieve competitive\nperformance with extremely low computational complexity and run faster than\nexisting models.\n","authors":["Abu Saleh Musa Miah","Md. Al Mehedi Hasan","Md Hadiuzzaman","Muhammad Nazrul Islam","Jungpil Shin"],"pdf_url":"https://arxiv.org/pdf/2408.14111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11311v2","updated":"2024-08-26T08:47:00Z","published":"2024-06-17T08:18:41Z","title":"Syn-to-Real Unsupervised Domain Adaptation for Indoor 3D Object\n Detection","summary":" The use of synthetic data in indoor 3D object detection offers the potential\nof greatly reducing the manual labor involved in 3D annotations and training\neffective zero-shot detectors. However, the complicated domain shifts across\nsyn-to-real indoor datasets remains underexplored. In this paper, we propose a\nnovel Object-wise Hierarchical Domain Alignment (OHDA) framework for\nsyn-to-real unsupervised domain adaptation in indoor 3D object detection. Our\napproach includes an object-aware augmentation strategy to effectively\ndiversify the source domain data, and we introduce a two-branch adaptation\nframework consisting of an adversarial training branch and a pseudo labeling\nbranch, in order to simultaneously reach holistic-level and class-level domain\nalignment. The pseudo labeling is further refined through two proposed schemes\nspecifically designed for indoor UDA. Our adaptation results from synthetic\ndataset 3D-FRONT to real-world datasets ScanNetV2 and SUN RGB-D demonstrate\nremarkable mAP25 improvements of 9.7% and 9.1% over Source-Only baselines,\nrespectively, and consistently outperform the methods adapted from 2D and 3D\noutdoor scenarios. The code will be publicly available upon paper acceptance.\n","authors":["Yunsong Wang","Na Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2406.11311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14087v1","updated":"2024-08-26T08:16:58Z","published":"2024-08-26T08:16:58Z","title":"LSM-YOLO: A Compact and Effective ROI Detector for Medical Detection","summary":" In existing medical Region of Interest (ROI) detection, there lacks an\nalgorithm that can simultaneously satisfy both real-time performance and\naccuracy, not meeting the growing demand for automatic detection in medicine.\nAlthough the basic YOLO framework ensures real-time detection due to its fast\nspeed, it still faces challenges in maintaining precision concurrently. To\nalleviate the above problems, we propose a novel model named Lightweight Shunt\nMatching-YOLO (LSM-YOLO), with Lightweight Adaptive Extraction (LAE) and\nMultipath Shunt Feature Matching (MSFM). 
Firstly, by using LAE to refine\nfeature extraction, the model can obtain more contextual information and\nhigh-resolution details from multiscale feature maps, thereby extracting\ndetailed features of ROI in medical images while reducing the influence of\nnoise. Secondly, MSFM is utilized to further refine the fusion of high-level\nsemantic features and low-level visual features, enabling better fusion between\nROI features and neighboring features, thereby improving the detection rate for\nbetter diagnostic assistance. Experimental results demonstrate that LSM-YOLO\nachieves 48.6% AP on a private dataset of pancreatic tumors, 65.1% AP on the\nBCCD blood cell detection public dataset, and 73.0% AP on the Br35h brain tumor\ndetection public dataset. Our model achieves state-of-the-art performance with\nminimal parameter cost on the above three datasets. The source codes are at:\nhttps://github.com/VincentYuuuuuu/LSM-YOLO.\n","authors":["Zhongwen Yu","Qiu Guan","Jianmin Yang","Zhiqiang Yang","Qianwei Zhou","Yang Chen","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14084v1","updated":"2024-08-26T08:11:35Z","published":"2024-08-26T08:11:35Z","title":"HABD: a houma alliance book ancient handwritten character recognition\n database","summary":" The Houma Alliance Book, one of history's earliest calligraphic examples, was\nunearthed in the 1970s. These artifacts were meticulously organized,\nreproduced, and copied by the Shanxi Provincial Institute of Cultural Relics.\nHowever, because of their ancient origins and severe ink erosion, identifying\ncharacters in the Houma Alliance Book is challenging, necessitating the use of\ndigital technology. In this paper, we propose a new ancient handwritten\ncharacter recognition database for the Houma alliance book, along with a novel\nbenchmark based on deep learning architectures. More specifically, a collection\nof 26,732 characters samples from the Houma Alliance Book were gathered,\nencompassing 327 different types of ancient characters through iterative\nannotation. Furthermore, benchmark algorithms were proposed by combining four\ndeep neural network classifiers with two data augmentation methods. This\nresearch provides valuable resources and technical support for further studies\non the Houma Alliance Book and other ancient characters. This contributes to\nour understanding of ancient culture and history, as well as the preservation\nand inheritance of humanity's cultural heritage.\n","authors":["Xiaoyu Yuan","Xiaohua Huang","Zibo Zhang","Yabo Sun"],"pdf_url":"https://arxiv.org/pdf/2408.14084v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.14080v1","updated":"2024-08-26T08:02:57Z","published":"2024-08-26T08:02:57Z","title":"SONICS: Synthetic Or Not -- Identifying Counterfeit Songs","summary":" The recent surge in AI-generated songs presents exciting possibilities and\nchallenges. While these tools democratize music creation, they also necessitate\nthe ability to distinguish between human-composed and AI-generated songs for\nsafeguarding artistic integrity and content curation. Existing research and\ndatasets in fake song detection only focus on singing voice deepfake detection\n(SVDD), where the vocals are AI-generated but the instrumental music is sourced\nfrom real songs. However, this approach is inadequate for contemporary\nend-to-end AI-generated songs where all components (vocals, lyrics, music, and\nstyle) could be AI-generated. 
Additionally, existing datasets lack lyrics-music\ndiversity, long-duration songs, and open fake songs. To address these gaps, we\nintroduce SONICS, a novel dataset for end-to-end Synthetic Song Detection\n(SSD), comprising over 97k songs with over 49k synthetic songs from popular\nplatforms like Suno and Udio. Furthermore, we highlight the importance of\nmodeling long-range temporal dependencies in songs for effective authenticity\ndetection, an aspect overlooked in existing methods. To capture these patterns,\nwe propose a novel model, SpecTTTra, that is up to 3 times faster and 6 times\nmore memory efficient compared to popular CNN and Transformer-based models\nwhile maintaining competitive performance. Finally, we offer both AI-based and\nhuman evaluation benchmarks, addressing another deficiency in current research.\n","authors":["Md Awsafur Rahman","Zaber Ibn Abdul Hakim","Najibul Haque Sarker","Bishmoy Paul","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2408.14080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14060v1","updated":"2024-08-26T07:37:59Z","published":"2024-08-26T07:37:59Z","title":"Evaluating the Visual Similarity of Southwest China's Ethnic Minority\n Brocade Based on Deep Learning","summary":" This paper employs deep learning methods to investigate the visual similarity\nof ethnic minority patterns in Southwest China. A customized SResNet-18 network\nwas developed, achieving an accuracy of 98.7% on the test set, outperforming\nResNet-18, VGGNet-16, and AlexNet. The extracted feature vectors from\nSResNet-18 were evaluated using three metrics: cosine similarity, Euclidean\ndistance, and Manhattan distance. The analysis results were visually\nrepresented on an ethnic thematic map, highlighting the connections between\nethnic patterns and their regional distributions.\n","authors":["Shichen Liu","Huaxing Lu"],"pdf_url":"https://arxiv.org/pdf/2408.14060v1.pdf","comment":"8 pages, 2 tables, 5 figures"},{"id":"http://arxiv.org/abs/2405.17137v3","updated":"2024-08-26T07:36:03Z","published":"2024-05-27T12:54:09Z","title":"Jump-teaching: Ultra Efficient and Robust Learning with Noisy Label","summary":" Sample selection is the most straightforward technique to combat label noise,\naiming to distinguish mislabeled samples during training and avoid the\ndegradation of the robustness of the model. In the workflow, $\\textit{selecting\npossibly clean data}$ and $\\textit{model update}$ are iterative. However, their\ninterplay and intrinsic characteristics hinder the robustness and efficiency of\nlearning with noisy labels: 1) The model chooses clean data with selection\nbias, leading to the accumulated error in the model update. 2) Most selection\nstrategies leverage partner networks or supplementary information to mitigate\nlabel corruption, albeit with increased computation resources and lower\nthroughput speed. Therefore, we employ only one network with a jump-manner\nupdate to decouple the interplay and mine more semantic information from the\nloss for a more precise selection. Specifically, the selection of clean data\nfor each model update is based on one of the prior models, excluding the last\niteration. The strategy of model update exhibits a jump behavior in form.\nMoreover, we map the outputs of the network and labels into the same semantic\nfeature space, respectively. In this space, a detailed and simple loss\ndistribution is generated to distinguish clean samples more effectively.
Our\nproposed approach achieves almost up to $2.53\\times$ speedup, $0.46\\times$ peak\nmemory footprint, and superior robustness over state-of-the-art works with\nvarious noise settings.\n","authors":["Kangye Ji","Fei Cheng","Zeqing Wang","Bohu Huang"],"pdf_url":"https://arxiv.org/pdf/2405.17137v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14051v1","updated":"2024-08-26T07:17:05Z","published":"2024-08-26T07:17:05Z","title":"Let Video Teaches You More: Video-to-Image Knowledge Distillation using\n DEtection TRansformer for Medical Video Lesion Detection","summary":" AI-assisted lesion detection models play a crucial role in the early\nscreening of cancer. However, previous image-based models ignore the\ninter-frame contextual information present in videos. On the other hand,\nvideo-based models capture the inter-frame context but are computationally\nexpensive. To mitigate this contradiction, we delve into Video-to-Image\nknowledge distillation leveraging DEtection TRansformer (V2I-DETR) for the task\nof medical video lesion detection. V2I-DETR adopts a teacher-student network\nparadigm. The teacher network aims at extracting temporal contexts from\nmultiple frames and transferring them to the student network, and the student\nnetwork is an image-based model dedicated to fast prediction in inference. By\ndistilling multi-frame contexts into a single frame, the proposed V2I-DETR\ncombines the advantages of utilizing temporal contexts from video-based models\nand the inference speed of image-based models. Through extensive experiments,\nV2I-DETR outperforms previous state-of-the-art methods by a large margin while\nachieving the real-time inference speed (30 FPS) as the image-based model.\n","authors":["Yuncheng Jiang","Zixun Zhang","Jun Wei","Chun-Mei Feng","Guanbin Li","Xiang Wan","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2408.14051v1.pdf","comment":"BIBM2024"},{"id":"http://arxiv.org/abs/2305.12236v2","updated":"2024-08-26T07:09:52Z","published":"2023-05-20T17:01:52Z","title":"Searching a Compact Architecture for Robust Multi-Exposure Image Fusion","summary":" In recent years, learning-based methods have achieved significant\nadvancements in multi-exposure image fusion. However, two major stumbling\nblocks hinder the development, including pixel misalignment and inefficient\ninference. Reliance on aligned image pairs in existing methods causes\nsusceptibility to artifacts due to device motion. Additionally, existing\ntechniques often rely on handcrafted architectures with huge network\nengineering, resulting in redundant parameters, adversely impacting inference\nefficiency and flexibility. To mitigate these limitations, this study\nintroduces an architecture search-based paradigm incorporating self-alignment\nand detail repletion modules for robust multi-exposure image fusion.\n Specifically, targeting the extreme discrepancy of exposure, we propose the\nself-alignment module, leveraging scene relighting to constrain the\nillumination degree for following alignment and feature extraction. Detail\nrepletion is proposed to enhance the texture details of scenes. Additionally,\nincorporating a hardware-sensitive constraint, we present the fusion-oriented\narchitecture search to explore compact and efficient networks for fusion. The\nproposed method outperforms various competitive schemes, achieving a noteworthy\n3.19\\% improvement in PSNR for general scenarios and an impressive 23.5\\%\nenhancement in misaligned scenarios. 
Moreover, it significantly reduces\ninference time by 69.1\\%. The code will be available at\nhttps://github.com/LiuZhu-CV/CRMEF.\n","authors":["Zhu Liu","Jinyuan Liu","Guanyao Wu","Zihang Chen","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2305.12236v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.14047v1","updated":"2024-08-26T07:02:17Z","published":"2024-08-26T07:02:17Z","title":"Alleviating Class Imbalance in Semi-supervised Multi-organ Segmentation\n via Balanced Subclass Regularization","summary":" Semi-supervised learning (SSL) has shown notable potential in relieving the\nheavy demand of dense prediction tasks on large-scale well-annotated datasets,\nespecially for the challenging multi-organ segmentation (MoS). However, the\nprevailing class-imbalance problem in MoS, caused by the substantial variations\nin organ size, exacerbates the learning difficulty of the SSL network. To\nalleviate this issue, we present a two-phase semi-supervised network (BSR-Net)\nwith balanced subclass regularization for MoS. Concretely, in Phase I, we\nintroduce a class-balanced subclass generation strategy based on balanced\nclustering to effectively generate multiple balanced subclasses from original\nbiased ones according to their pixel proportions. Then, in Phase II, we design\nan auxiliary subclass segmentation (SCS) task within the multi-task framework\nof the main MoS task. The SCS task contributes a balanced subclass\nregularization to the main MoS task and transfers unbiased knowledge to the MoS\nnetwork, thus alleviating the influence of the class-imbalance problem.\nExtensive experiments conducted on two publicly available datasets, i.e., the\nMICCAI FLARE 2022 dataset and the WORD dataset, verify the superior performance\nof our method compared with other methods.\n","authors":["Zhenghao Feng","Lu Wen","Binyu Yan","Jiaqi Cui","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06607v4","updated":"2024-08-26T06:57:51Z","published":"2023-11-11T16:37:41Z","title":"Monkey: Image Resolution and Text Label Are Important Things for Large\n Multi-modal Models","summary":" Large Multimodal Models (LMMs) have shown promise in vision-language tasks\nbut struggle with high-resolution input and detailed scene understanding.\nAddressing these challenges, we introduce Monkey to enhance LMM capabilities.\nFirstly, Monkey processes input images by dividing them into uniform patches,\neach matching the size (e.g., 448x448) used in the original training of the\nwell-trained vision encoder. Equipped with individual adapter for each patch,\nMonkey can handle higher resolutions up to 1344x896 pixels, enabling the\ndetailed capture of complex visual information. Secondly, it employs a\nmulti-level description generation method, enriching the context for\nscene-object associations. This two-part strategy ensures more effective\nlearning from generated data: the higher resolution allows for a more detailed\ncapture of visuals, which in turn enhances the effectiveness of comprehensive\ndescriptions. Extensive ablative results validate the effectiveness of our\ndesigns. Additionally, experiments on 18 datasets further demonstrate that\nMonkey surpasses existing LMMs in many tasks like Image Captioning and various\nVisual Question Answering formats. Specially, in qualitative tests focused on\ndense text question answering, Monkey has exhibited encouraging results\ncompared with GPT4V. 
Code is available at\nhttps://github.com/Yuliang-Liu/Monkey.\n","authors":["Zhang Li","Biao Yang","Qiang Liu","Zhiyin Ma","Shuo Zhang","Jingxu Yang","Yabo Sun","Yuliang Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2311.06607v4.pdf","comment":"CVPR 2024 Highlight"},{"id":"http://arxiv.org/abs/2406.10737v2","updated":"2024-08-26T06:37:24Z","published":"2024-06-15T20:47:38Z","title":"Dynamic Domains, Dynamic Solutions: DPCore for Continual Test-Time\n Adaptation","summary":" Continual Test-Time Adaptation (CTTA) seeks to adapt a source pre-trained\nmodel to continually changing, unlabeled target domains. Existing TTA methods\nare typically designed for environments where domain changes occur sequentially\nand can struggle in more dynamic scenarios, as illustrated in Figure\n\\ref{fig:settings}. Inspired by the principles of online K-Means, we introduce\na novel approach to CTTA through visual prompting. We propose a \\emph{Dynamic\nPrompt Coreset} that not only preserves knowledge from previously visited\ndomains but also accommodates learning from new potential domains. This is\ncomplemented by a distance-based \\emph{Weight Updating Mechanism} that ensures\nthe coreset remains current and relevant. Our approach employs a fixed model\narchitecture alongside the coreset and an innovative updating system to\neffectively mitigate challenges such as catastrophic forgetting and error\naccumulation. Extensive testing on four widely-used benchmarks demonstrates\nthat our method consistently outperforms state-of-the-art alternatives in both\nclassification and segmentation CTTA tasks across the structured and dynamic\nCTTA settings, with $99\\%$ fewer trainable parameters.\n","authors":["Yunbei Zhang","Akshay Mehra","Jihun Hamm"],"pdf_url":"https://arxiv.org/pdf/2406.10737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.08158v4","updated":"2024-08-26T06:30:52Z","published":"2021-08-18T14:04:52Z","title":"Practical X-ray Gastric Cancer Screening Using Refined Stochastic Data\n Augmentation and Hard Boundary Box Training","summary":" Endoscopy is widely used to diagnose gastric cancer and has a high diagnostic\nperformance, but it must be performed by a physician, which limits the number\nof people who can be diagnosed. In contrast, gastric X-rays can be performed by\ntechnicians and screen a much larger number of patients, but accurate diagnosis\nrequires experience. We propose an unprecedented and practical gastric cancer\ndiagnosis support system for gastric X-ray images, enabling more people to be\nscreened. The system is based on a general deep learning-based object detection\nmodel and incorporates two novel techniques: refined probabilistic stomach\nimage augmentation (R-sGAIA) and hard boundary box training (HBBT). R-sGAIA\nenhances the probabilistic gastric fold region, providing more learning\npatterns for cancer detection models. HBBT is an efficient training method that\nimproves model performance by allowing the use of unannotated negative (i.e.,\nhealthy control) samples, which are typically unusable in conventional\ndetection models. The proposed system achieves a sensitivity (SE) for gastric\ncancer of 90.2%, higher than that of an expert (85.5%). Additionally, two out\nof five detected candidate boxes are cancerous, maintaining high precision\nwhile processing images at a speed of 0.51 seconds per image. The system also\noutperforms methods using the same object detection model and state-of-the-art\ndata augmentation, showing a 5.9-point improvement in the F1 score. 
In summary,\nthis system efficiently identifies areas for radiologists to examine within a\npractical timeframe, significantly reducing their workload.\n","authors":["Hideaki Okamoto","Quan Huu Cap","Takakiyo Nomura","Kazuhito Nabeshima","Jun Hashimoto","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2108.08158v4.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.14039v1","updated":"2024-08-26T06:22:54Z","published":"2024-08-26T06:22:54Z","title":"Collaborative Perception in Multi-Robot Systems: Case Studies in\n Household Cleaning and Warehouse Operations","summary":" This paper explores the paradigm of Collaborative Perception (CP), where\nmultiple robots and sensors in the environment share and integrate sensor data\nto construct a comprehensive representation of the surroundings. By aggregating\ndata from various sensors and utilizing advanced algorithms, the collaborative\nperception framework improves task efficiency, coverage, and safety. Two case\nstudies are presented to showcase the benefits of collaborative perception in\nmulti-robot systems. The first case study illustrates the benefits and\nadvantages of using CP for the task of household cleaning with a team of\ncleaning robots. The second case study performs a comparative analysis of the\nperformance of CP versus Standalone Perception (SP) for Autonomous Mobile\nRobots operating in a warehouse environment. The case studies validate the\neffectiveness of CP in enhancing multi-robot coordination, task completion, and\noverall system performance and its potential to impact operations in other\napplications as well. Future investigations will focus on optimizing the\nframework and validating its performance through empirical testing.\n","authors":["Bharath Rajiv Nair"],"pdf_url":"https://arxiv.org/pdf/2408.14039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02462v2","updated":"2024-08-26T06:12:05Z","published":"2024-04-03T05:04:55Z","title":"A Unified Membership Inference Method for Visual Self-supervised Encoder\n via Part-aware Capability","summary":" Self-supervised learning shows promise in harnessing extensive unlabeled\ndata, but it also confronts significant privacy concerns, especially in vision.\nIn this paper, we aim to perform membership inference on visual self-supervised\nmodels in a more realistic setting: self-supervised training method and details\nare unknown for an adversary when attacking as he usually faces a black-box\nsystem in practice. In this setting, considering that self-supervised model\ncould be trained by completely different self-supervised paradigms, e.g.,\nmasked image modeling and contrastive learning, with complex training details,\nwe propose a unified membership inference method called PartCrop. It is\nmotivated by the shared part-aware capability among models and stronger part\nresponse on the training data. Specifically, PartCrop crops parts of objects in\nan image to query responses with the image in representation space. We conduct\nextensive attacks on self-supervised models with different training protocols\nand structures using three widely used image datasets. The results verify the\neffectiveness and generalization of PartCrop. Moreover, to defend against\nPartCrop, we evaluate two common approaches, i.e., early stop and differential\nprivacy, and propose a tailored method called shrinking crop scale range. The\ndefense experiments indicate that all of them are effective. 
Our code is\navailable at https://github.com/JiePKU/PartCrop.\n","authors":["Jie Zhu","Jirong Zha","Ding Li","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02462v2.pdf","comment":"Accepted by ACM CCS2024, Full version"},{"id":"http://arxiv.org/abs/2408.14035v1","updated":"2024-08-26T06:01:54Z","published":"2024-08-26T06:01:54Z","title":"FAST-LIVO2: Fast, Direct LiDAR-Inertial-Visual Odometry","summary":" This paper proposes FAST-LIVO2: a fast, direct LiDAR-inertial-visual odometry\nframework to achieve accurate and robust state estimation in SLAM tasks and\nprovide great potential in real-time, onboard robotic applications. FAST-LIVO2\nfuses the IMU, LiDAR and image measurements efficiently through an ESIKF. To\naddress the dimension mismatch between the heterogeneous LiDAR and image\nmeasurements, we use a sequential update strategy in the Kalman filter. To\nenhance the efficiency, we use direct methods for both the visual and LiDAR\nfusion, where the LiDAR module registers raw points without extracting edge or\nplane features and the visual module minimizes direct photometric errors\nwithout extracting ORB or FAST corner features. The fusion of both visual and\nLiDAR measurements is based on a single unified voxel map where the LiDAR\nmodule constructs the geometric structure for registering new LiDAR scans and\nthe visual module attaches image patches to the LiDAR points. To enhance the\naccuracy of image alignment, we use plane priors from the LiDAR points in the\nvoxel map (and even refine the plane prior) and update the reference patch\ndynamically after new images are aligned. Furthermore, to enhance the\nrobustness of image alignment, FAST-LIVO2 employs an on-demanding raycast\noperation and estimates the image exposure time in real time. Lastly, we detail\nthree applications of FAST-LIVO2: UAV onboard navigation demonstrating the\nsystem's computation efficiency for real-time onboard navigation, airborne\nmapping showcasing the system's mapping accuracy, and 3D model rendering\n(mesh-based and NeRF-based) underscoring the suitability of our reconstructed\ndense map for subsequent rendering tasks. We open source our code, dataset and\napplication on GitHub to benefit the robotics community.\n","authors":["Chunran Zheng","Wei Xu","Zuhao Zou","Tong Hua","Chongjian Yuan","Dongjiao He","Bingyang Zhou","Zheng Liu","Jiarong Lin","Fangcheng Zhu","Yunfan Ren","Rong Wang","Fanle Meng","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14035v1.pdf","comment":"30 pages, 31 figures, due to the limitation that 'The abstract field\n cannot exceed 1,920 characters', the abstract presented here is shorter than\n the one in the PDF file"},{"id":"http://arxiv.org/abs/2403.18878v2","updated":"2024-08-26T05:54:21Z","published":"2024-03-27T10:46:24Z","title":"Teaching AI the Anatomy Behind the Scan: Addressing Anatomical Flaws in\n Medical Image Segmentation with Learnable Prior","summary":" Imposing key anatomical features, such as the number of organs, their shapes\nand relative positions, is crucial for building a robust multi-organ\nsegmentation model. Current attempts to incorporate anatomical features include\nbroadening the effective receptive field (ERF) size with data-intensive\nmodules, or introducing anatomical constraints that scales poorly to\nmulti-organ segmentation. We introduce a novel architecture called the\nAnatomy-Informed Cascaded Segmentation Network (AIC-Net). 
AIC-Net incorporates\na learnable input termed \"Anatomical Prior\", which can be adapted to\npatient-specific anatomy using a differentiable spatial deformation. The\ndeformed prior later guides decoder layers towards more anatomy-informed\npredictions. We repeat this process at a local patch level to enhance the\nrepresentation of intricate objects, resulting in a cascaded network structure.\nAIC-Net is a general method that enhances any existing segmentation models to\nbe more anatomy-aware. We have validated the performance of AIC-Net, with\nvarious backbones, on two multi-organ segmentation tasks: abdominal organs and\nvertebrae. For each respective task, our benchmarks demonstrate improved dice\nscore and Hausdorff distance.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2403.18878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14032v1","updated":"2024-08-26T05:52:35Z","published":"2024-08-26T05:52:35Z","title":"More Pictures Say More: Visual Intersection Network for Open Set Object\n Detection","summary":" Open Set Object Detection has seen rapid development recently, but it\ncontinues to pose significant challenges. Language-based methods, grappling\nwith the substantial modal disparity between textual and visual modalities,\nrequire extensive computational resources to bridge this gap. Although\nintegrating visual prompts into these frameworks shows promise for enhancing\nperformance, it always comes with constraints related to textual semantics. In\ncontrast, viusal-only methods suffer from the low-quality fusion of multiple\nvisual prompts. In response, we introduce a strong DETR-based model, Visual\nIntersection Network for Open Set Object Detection (VINO), which constructs a\nmulti-image visual bank to preserve the semantic intersections of each category\nacross all time steps. Our innovative multi-image visual updating mechanism\nlearns to identify the semantic intersections from various visual prompts,\nenabling the flexible incorporation of new information and continuous\noptimization of feature representations. Our approach guarantees a more precise\nalignment between target category semantics and region semantics, while\nsignificantly reducing pre-training time and resource demands compared to\nlanguage-based methods. Furthermore, the integration of a segmentation head\nillustrates the broad applicability of visual intersection in various visual\ntasks. VINO, which requires only 7 RTX4090 GPU days to complete one epoch on\nthe Objects365v1 dataset, achieves competitive performance on par with\nvision-language models on benchmarks such as LVIS and ODinW35.\n","authors":["Bingcheng Dong","Yuning Ding","Jinrong Zhang","Sifan Zhang","Shenglan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14032v1.pdf","comment":"7pages"},{"id":"http://arxiv.org/abs/2408.14028v1","updated":"2024-08-26T05:38:27Z","published":"2024-08-26T05:38:27Z","title":"SurGen: Text-Guided Diffusion Model for Surgical Video Generation","summary":" Diffusion-based video generation models have made significant strides,\nproducing outputs with improved visual fidelity, temporal coherence, and user\ncontrol. 
These advancements hold great promise for improving surgical education\nby enabling more realistic, diverse, and interactive simulation environments.\nIn this study, we introduce SurGen, a text-guided diffusion model tailored for\nsurgical video synthesis, producing the highest resolution and longest duration\nvideos among existing surgical video generation models. We validate the visual\nand temporal quality of the outputs using standard image and video generation\nmetrics. Additionally, we assess their alignment to the corresponding text\nprompts through a deep learning classifier trained on surgical data. Our\nresults demonstrate the potential of diffusion models to serve as valuable\neducational tools for surgical trainees.\n","authors":["Joseph Cho","Samuel Schmidgall","Cyril Zakka","Mrudang Mathur","Rohan Shad","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2408.14028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14023v1","updated":"2024-08-26T05:27:14Z","published":"2024-08-26T05:27:14Z","title":"Video-CCAM: Enhancing Video-Language Understanding with Causal\n Cross-Attention Masks for Short and Long Videos","summary":" Multi-modal large language models (MLLMs) have demonstrated considerable\npotential across various downstream tasks that require cross-domain knowledge.\nMLLMs capable of processing videos, known as Video-MLLMs, have attracted broad\ninterest in video-language understanding. However, videos, especially long\nvideos, contain more visual tokens than images, making them difficult for LLMs\nto process. Existing works either downsample visual features or extend the LLM\ncontext size, risking the loss of high-resolution information or slowing down\ninference speed. To address these limitations, we apply cross-attention layers\nin the intermediate projector between the visual encoder and the large language\nmodel (LLM). As the naive cross-attention mechanism is insensitive to temporal\norder, we further introduce causal cross-attention masks (CCAMs) within the\ncross-attention layers. This Video-MLLM, named Video-CCAM, is trained in a\nstraightforward two-stage fashion: feature alignment and visual instruction\ntuning. We develop several Video-CCAM models based on LLMs of different sizes\n(4B, 9B, and 14B). Video-CCAM proves to be a robust Video-MLLM and shows\noutstanding performance from short videos to long ones. Among standard video\nbenchmarks like MVBench and VideoChatGPT-QA, Video-CCAM shows outstanding\nperformances (1st/2nd/3rd in MVBench and TGIF-QA, 2nd/3rd/4th in MSVD-QA,\nMSRVTT-QA, and ActivityNet-QA). In benchmarks encompassing long videos,\nVideo-CCAM models can be directly adapted to long video understanding and still\nachieve exceptional scores despite being trained solely with images and\n16-frame videos. Using 96 frames (6$\\times$ the training number of frames),\nVideo-CCAM models rank 1st/2nd/3rd in VideoVista and 1st/2nd/4th in MLVU among\nall open-source Video-MLLMs, respectively. 
The code is publicly available in\n\\url{https://github.com/QQ-MM/Video-CCAM}.\n","authors":["Jiajun Fei","Dian Li","Zhidong Deng","Zekun Wang","Gang Liu","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14023v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.11545v2","updated":"2024-08-26T05:21:35Z","published":"2024-08-21T11:53:53Z","title":"UNetMamba: An Efficient UNet-Like Mamba for Semantic Segmentation of\n High-Resolution Remote Sensing Images","summary":" Semantic segmentation of high-resolution remote sensing images is vital in\ndownstream applications such as land-cover mapping, urban planning and disaster\nassessment.Existing Transformer-based methods suffer from the constraint\nbetween accuracy and efficiency, while the recently proposed Mamba is renowned\nfor being efficient. Therefore, to overcome the dilemma, we propose UNetMamba,\na UNet-like semantic segmentation model based on Mamba. It incorporates a mamba\nsegmentation decoder (MSD) that can efficiently decode the complex information\nwithin high-resolution images, and a local supervision module (LSM), which is\ntrain-only but can significantly enhance the perception of local contents.\nExtensive experiments demonstrate that UNetMamba outperforms the\nstate-of-the-art methods with mIoU increased by 0.87% on LoveDA and 0.36% on\nISPRS Vaihingen, while achieving high efficiency through the lightweight\ndesign, less memory footprint and reduced computational cost. The source code\nis available at https://github.com/EnzeZhu2001/UNetMamba.\n","authors":["Enze Zhu","Zhan Chen","Dingkai Wang","Hanru Shi","Xiaoxuan Liu","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11545v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.14213v2","updated":"2024-08-26T04:59:05Z","published":"2024-05-23T06:17:23Z","title":"From Text to Pixel: Advancing Long-Context Understanding in MLLMs","summary":" The rapid progress in Multimodal Large Language Models (MLLMs) has\nsignificantly advanced their ability to process and understand complex visual\nand textual information. However, the integration of multiple images and\nextensive textual contexts remains a challenge due to the inherent limitation\nof the models' capacity to handle long input sequences efficiently. In this\npaper, we introduce SEEKER, a multimodal large language model designed to\ntackle this issue. SEEKER aims to optimize the compact encoding of long text by\ncompressing the text sequence into the visual pixel space via images, enabling\nthe model to handle long text within a fixed token-length budget efficiently.\nOur empirical experiments on six long-context multimodal tasks demonstrate that\nSEEKER can leverage fewer image tokens to convey the same amount of textual\ninformation compared with the OCR-based approach, and is more efficient in\nunderstanding long-form multimodal input and generating long-form textual\noutput, outperforming all existing proprietary and open-source MLLMs by large\nmargins.\n","authors":["Yujie Lu","Xiujun Li","Tsu-Jui Fu","Miguel Eckstein","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14016v1","updated":"2024-08-26T04:56:41Z","published":"2024-08-26T04:56:41Z","title":"Pixel-Aligned Multi-View Generation with Depth Guided Decoder","summary":" The task of image-to-multi-view generation refers to generating novel views\nof an instance from a single image. 
Recent methods achieve this by extending\ntext-to-image latent diffusion models to multi-view version, which contains an\nVAE image encoder and a U-Net diffusion model. Specifically, these generation\nmethods usually fix VAE and finetune the U-Net only. However, the significant\ndownscaling of the latent vectors computed from the input images and\nindependent decoding leads to notable pixel-level misalignment across multiple\nviews. To address this, we propose a novel method for pixel-level\nimage-to-multi-view generation. Unlike prior work, we incorporate attention\nlayers across multi-view images in the VAE decoder of a latent video diffusion\nmodel. Specifically, we introduce a depth-truncated epipolar attention,\nenabling the model to focus on spatially adjacent regions while remaining\nmemory efficient. Applying depth-truncated attn is challenging during inference\nas the ground-truth depth is usually difficult to obtain and pre-trained depth\nestimation models is hard to provide accurate depth. Thus, to enhance the\ngeneralization to inaccurate depth when ground truth depth is missing, we\nperturb depth inputs during training. During inference, we employ a rapid\nmulti-view to 3D reconstruction approach, NeuS, to obtain coarse depth for the\ndepth-truncated epipolar attention. Our model enables better pixel alignment\nacross multi-view images. Moreover, we demonstrate the efficacy of our approach\nin improving downstream multi-view to 3D reconstruction tasks.\n","authors":["Zhenggang Tang","Peiye Zhuang","Chaoyang Wang","Aliaksandr Siarohin","Yash Kant","Alexander Schwing","Sergey Tulyakov","Hsin-Ying Lee"],"pdf_url":"https://arxiv.org/pdf/2408.14016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14013v1","updated":"2024-08-26T04:36:10Z","published":"2024-08-26T04:36:10Z","title":"A Multiscale Gradient Fusion Method for Edge Detection in Color Images\n Utilizing the CBM3D Filter","summary":" In this paper, a color edge detection strategy based on collaborative\nfiltering combined with multiscale gradient fusion is proposed. The\nblock-matching and 3D (BM3D) filter are used to enhance the sparse\nrepresentation in the transform domain and achieve the effect of denoising,\nwhereas the multiscale gradient fusion makes up for the defect of loss of\ndetails in single-scale edge detection and improves the edge detection\nresolution and quality. First, the RGB images in the dataset are converted to\nXYZ color space images through mathematical operations. Second, the colored\nblock-matching and 3D (CBM3D) filter are used on the sparse images and to\nremove noise interference. Then, the vector gradients of the color image and\nthe anisotropic Gaussian directional derivative of the two scale parameters are\ncalculated and averaged pixel-by-pixel to obtain a new edge strength map.\nFinally, the edge features are enhanced by image normalization and non-maximum\nsuppression technology, and on that basis, the edge contour is obtained by\ndouble threshold selection and a new morphological refinement method. 
Through\nan experimental analysis of the edge detection dataset, the method proposed has\ngood noise robustness and high edge quality, which is better than the Color\nSobel, Color Canny, SE and Color AGDD as shown by the PR curve, AUC, PSNR, MSE,\nand FOM indicators.\n","authors":["Zhuoyue Wang","Yiyi Tao","Danqing Ma"],"pdf_url":"https://arxiv.org/pdf/2408.14013v1.pdf","comment":"1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2408.14008v1","updated":"2024-08-26T04:29:52Z","published":"2024-08-26T04:29:52Z","title":"LMM-VQA: Advancing Video Quality Assessment with Large Multimodal Models","summary":" The explosive growth of videos on streaming media platforms has underscored\nthe urgent need for effective video quality assessment (VQA) algorithms to\nmonitor and perceptually optimize the quality of streaming videos. However, VQA\nremains an extremely challenging task due to the diverse video content and the\ncomplex spatial and temporal distortions, thus necessitating more advanced\nmethods to address these issues. Nowadays, large multimodal models (LMMs), such\nas GPT-4V, have exhibited strong capabilities for various visual understanding\ntasks, motivating us to leverage the powerful multimodal representation ability\nof LMMs to solve the VQA task. Therefore, we propose the first Large\nMulti-Modal Video Quality Assessment (LMM-VQA) model, which introduces a novel\nspatiotemporal visual modeling strategy for quality-aware feature extraction.\nSpecifically, we first reformulate the quality regression problem into a\nquestion and answering (Q&A) task and construct Q&A prompts for VQA instruction\ntuning. Then, we design a spatiotemporal vision encoder to extract spatial and\ntemporal features to represent the quality characteristics of videos, which are\nsubsequently mapped into the language space by the spatiotemporal projector for\nmodality alignment. Finally, the aligned visual tokens and the quality-inquired\ntext tokens are aggregated as inputs for the large language model (LLM) to\ngenerate the quality score and level. Extensive experiments demonstrate that\nLMM-VQA achieves state-of-the-art performance across five VQA benchmarks,\nexhibiting an average improvement of $5\\%$ in generalization ability over\nexisting methods. Furthermore, due to the advanced design of the spatiotemporal\nencoder and projector, LMM-VQA also performs exceptionally well on general\nvideo understanding tasks, further validating its effectiveness. Our code will\nbe released at https://github.com/Sueqk/LMM-VQA.\n","authors":["Qihang Ge","Wei Sun","Yu Zhang","Yunhao Li","Zhongpeng Ji","Fengyu Sun","Shangling Jui","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2408.14008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12321v2","updated":"2024-08-26T04:27:54Z","published":"2024-08-22T11:57:16Z","title":"MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework\n for Multimodal Large Language Model","summary":" This paper presents MaVEn, an innovative Multi-granularity Visual Encoding\nframework designed to enhance the capabilities of Multimodal Large Language\nModels (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on\nsingle-image visual understanding, limiting their ability to interpret and\nintegrate information across multiple images. MaVEn addresses this limitation\nby combining discrete visual symbol sequences, which abstract coarse-grained\nsemantic concepts, with traditional continuous representation sequences that\nmodel fine-grained features. 
This dual approach bridges the semantic gap\nbetween visual and textual data, thereby improving the model's ability to\nprocess and interpret information from multiple images effectively.\nAdditionally, we design a dynamic reduction mechanism by for long-sequence\ncontinuous features to enhance multi-image processing efficiency. Experimental\nresults demonstrate that MaVEn significantly enhances MLLMs' understanding in\ncomplex multi-image scenarios, while also improving performance in single-image\ncontexts.\n","authors":["Chaoya Jiang","Jia Hongrui","Haiyang Xu","Wei Ye","Mengfan Dong","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08549v2","updated":"2024-08-26T03:59:17Z","published":"2024-04-12T15:45:26Z","title":"Practical Guidelines for Cell Segmentation Models Under Optical\n Aberrations in Microscopy","summary":" Cell segmentation is essential in biomedical research for analyzing cellular\nmorphology and behavior. Deep learning methods, particularly convolutional\nneural networks (CNNs), have revolutionized cell segmentation by extracting\nintricate features from images. However, the robustness of these methods under\nmicroscope optical aberrations remains a critical challenge. This study\nevaluates cell image segmentation models under optical aberrations from\nfluorescence and bright field microscopy. By simulating different types of\naberrations, including astigmatism, coma, spherical aberration, trefoil, and\nmixed aberrations, we conduct a thorough evaluation of various cell instance\nsegmentation models using the DynamicNuclearNet (DNN) and LIVECell datasets,\nrepresenting fluorescence and bright field microscopy cell datasets,\nrespectively. We train and test several segmentation models, including the Otsu\nthreshold method and Mask R-CNN with different network heads (FPN, C3) and\nbackbones (ResNet, VGG, Swin Transformer), under aberrated conditions.\nAdditionally, we provide usage recommendations for the Cellpose 2.0 Toolbox on\ncomplex cell degradation images. The results indicate that the combination of\nFPN and SwinS demonstrates superior robustness in handling simple cell images\naffected by minor aberrations. In contrast, Cellpose 2.0 proves effective for\ncomplex cell images under similar conditions. Furthermore, we innovatively\npropose the Point Spread Function Image Label Classification Model (PLCM). This\nmodel can quickly and accurately identify aberration types and amplitudes from\nPSF images, assisting researchers without optical training. Through PLCM,\nresearchers can better apply our proposed cell segmentation guidelines.\n","authors":["Boyuan Peng","Jiaju Chen","P. Bilha Githinji","Ijaz Gul","Qihui Ye","Minjiang Chen","Peiwu Qin","Xingru Huang","Chenggang Yan","Dongmei Yu","Jiansong Ji","Zhenglin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12616v2","updated":"2024-08-26T03:47:06Z","published":"2024-08-08T16:46:14Z","title":"Semantic Communication based on Large Language Model for Underwater\n Image Transmission","summary":" Underwater communication is essential for environmental monitoring, marine\nbiology research, and underwater exploration. 
Traditional underwater\ncommunication faces limitations like low bandwidth, high latency, and\nsusceptibility to noise, while semantic communication (SC) offers a promising\nsolution by focusing on the exchange of semantics rather than symbols or bits.\nHowever, SC encounters challenges in underwater environments, including\nsemantic information mismatch and difficulties in accurately identifying and\ntransmitting critical information that aligns with the diverse requirements of\nunderwater applications. To address these challenges, we propose a novel\nSemantic Communication (SC) framework based on Large Language Models (LLMs).\nOur framework leverages visual LLMs to perform semantic compression and\nprioritization of underwater image data according to the query from users. By\nidentifying and encoding key semantic elements within the images, the system\nselectively transmits high-priority information while applying higher\ncompression rates to less critical regions. On the receiver side, an LLM-based\nrecovery mechanism, along with Global Vision ControlNet and Key Region\nControlNet networks, aids in reconstructing the images, thereby enhancing\ncommunication efficiency and robustness. Our framework reduces the overall data\nsize to 0.8\\% of the original. Experimental results demonstrate that our method\nsignificantly outperforms existing approaches, ensuring high-quality,\nsemantically accurate image reconstruction.\n","authors":["Weilong Chen","Wenxuan Xu","Haoran Chen","Xinran Zhang","Zhijin Qin","Yanru Zhang","Zhu Han"],"pdf_url":"https://arxiv.org/pdf/2408.12616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13995v1","updated":"2024-08-26T03:35:13Z","published":"2024-08-26T03:35:13Z","title":"Avatar Concept Slider: Manipulate Concepts In Your Human Avatar With\n Fine-grained Control","summary":" Language based editing of 3D human avatars to precisely match user\nrequirements is challenging due to the inherent ambiguity and limited\nexpressiveness of natural language. To overcome this, we propose the Avatar\nConcept Slider (ACS), a 3D avatar editing method that allows precise\nmanipulation of semantic concepts in human avatars towards a specified\nintermediate point between two extremes of concepts, akin to moving a knob\nalong a slider track. To achieve this, our ACS has three designs. 1) A Concept\nSliding Loss based on Linear Discriminant Analysis to pinpoint the\nconcept-specific axis for precise editing. 2) An Attribute Preserving Loss\nbased on Principal Component Analysis for improved preservation of avatar\nidentity during editing. 3) A 3D Gaussian Splatting primitive selection\nmechanism based on concept-sensitivity, which updates only the primitives that\nare the most sensitive to our target concept, to improve efficiency. 
Results\ndemonstrate that our ACS enables fine-grained 3D avatar editing with efficient\nfeedback, without harming the avatar quality or compromising the avatar's\nidentifying attributes.\n","authors":["Yixuan He","Lin Geng Foo","Ajmal Saeed Mian","Hossein Rahmani","Jun Jiu"],"pdf_url":"https://arxiv.org/pdf/2408.13995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06886v7","updated":"2024-08-26T03:25:12Z","published":"2024-07-09T14:14:47Z","title":"Aligning Cyber Space with Physical World: A Comprehensive Survey on\n Embodied AI","summary":" Embodied Artificial Intelligence (Embodied AI) is crucial for achieving\nArtificial General Intelligence (AGI) and serves as a foundation for various\napplications that bridge cyberspace and the physical world. Recently, the\nemergence of Multi-modal Large Models (MLMs) and World Models (WMs) have\nattracted significant attention due to their remarkable perception,\ninteraction, and reasoning capabilities, making them a promising architecture\nfor the brain of embodied agents. However, there is no comprehensive survey for\nEmbodied AI in the era of MLMs. In this survey, we give a comprehensive\nexploration of the latest advancements in Embodied AI. Our analysis firstly\nnavigates through the forefront of representative works of embodied robots and\nsimulators, to fully understand the research focuses and their limitations.\nThen, we analyze four main research targets: 1) embodied perception, 2)\nembodied interaction, 3) embodied agent, and 4) sim-to-real adaptation,\ncovering the state-of-the-art methods, essential paradigms, and comprehensive\ndatasets. Additionally, we explore the complexities of MLMs in virtual and real\nembodied agents, highlighting their significance in facilitating interactions\nin dynamic digital and physical environments. Finally, we summarize the\nchallenges and limitations of embodied AI and discuss their potential future\ndirections. We hope this survey will serve as a foundational reference for the\nresearch community and inspire continued innovation. The associated project can\nbe found at https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List.\n","authors":["Yang Liu","Weixing Chen","Yongjie Bai","Xiaodan Liang","Guanbin Li","Wen Gao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2407.06886v7.pdf","comment":"The first comprehensive review of Embodied AI in the era of MLMs, 39\n pages. We also provide the paper list for Embodied AI:\n https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List"},{"id":"http://arxiv.org/abs/2303.17117v2","updated":"2024-08-26T03:22:08Z","published":"2023-03-30T03:09:25Z","title":"Reliable Representations Learning for Incomplete Multi-View Partial\n Multi-Label Classification","summary":" As a cross-topic of multi-view learning and multi-label classification,\nmulti-view multi-label classification has gradually gained traction in recent\nyears. The application of multi-view contrastive learning has further\nfacilitated this process, however, the existing multi-view contrastive learning\nmethods crudely separate the so-called negative pair, which largely results in\nthe separation of samples belonging to the same category or similar ones.\nBesides, plenty of multi-view multi-label learning methods ignore the possible\nabsence of views and labels. 
To address these issues, in this paper, we propose\nan incomplete multi-view partial multi-label classification network named RANK.\nIn this network, a label-driven multi-view contrastive learning strategy is\nproposed to leverage supervised information to preserve the structure within\nview and perform consistent alignment across views. Furthermore, we break\nthrough the view-level weights inherent in existing methods and propose a\nquality-aware sub-network to dynamically assign quality scores to each view of\neach sample. The label correlation information is fully utilized in the final\nmulti-label cross-entropy classification loss, effectively improving the\ndiscriminative power. Last but not least, our model is not only able to handle\ncomplete multi-view multi-label datasets, but also works on datasets with\nmissing instances and labels. Extensive experiments confirm that our RANK\noutperforms existing state-of-the-art methods.\n","authors":["Chengliang Liu","Jie Wen","Yong Xu","Bob Zhang","Liqiang Nie","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.17117v2.pdf","comment":"Please contact me if you have any questions: liucl1996@163.com"},{"id":"http://arxiv.org/abs/2408.10848v2","updated":"2024-08-26T03:19:45Z","published":"2024-08-20T13:40:25Z","title":"Perception-guided Jailbreak against Text-to-Image Models","summary":" In recent years, Text-to-Image (T2I) models have garnered significant\nattention due to their remarkable advancements. However, security concerns have\nemerged due to their potential to generate inappropriate or Not-Safe-For-Work\n(NSFW) images. In this paper, inspired by the observation that texts with\ndifferent semantics can lead to similar human perceptions, we propose an\nLLM-driven perception-guided jailbreak method, termed PGJ. It is a black-box\njailbreak method that requires no specific T2I model (model-free) and generates\nhighly natural attack prompts. Specifically, we propose identifying a safe\nphrase that is similar in human perception yet inconsistent in text semantics\nwith the target unsafe word and using it as a substitution. The experiments\nconducted on six open-source models and commercial online services with\nthousands of prompts have verified the effectiveness of PGJ.\n","authors":["Yihao Huang","Le Liang","Tianlin Li","Xiaojun Jia","Run Wang","Weikai Miao","Geguang Pu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10848v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.13988v1","updated":"2024-08-26T03:02:41Z","published":"2024-08-26T03:02:41Z","title":"Automatic Medical Report Generation: Methods and Applications","summary":" The increasing demand for medical imaging has surpassed the capacity of\navailable radiologists, leading to diagnostic delays and potential\nmisdiagnoses. Artificial intelligence (AI) techniques, particularly in\nautomatic medical report generation (AMRG), offer a promising solution to this\ndilemma. This review comprehensively examines AMRG methods from 2021 to 2024.\nIt (i) presents solutions to primary challenges in this field, (ii) explores\nAMRG applications across various imaging modalities, (iii) introduces publicly\navailable datasets, (iv) outlines evaluation metrics, (v) identifies techniques\nthat significantly enhance model performance, and (vi) discusses unresolved\nissues and potential future research directions. This paper aims to provide a\ncomprehensive understanding of the existing literature and inspire valuable\nfuture research.\n","authors":["Li Guo","Anas M. 
Tahir","Dong Zhang","Z. Jane Wang","Rabab K. Ward"],"pdf_url":"https://arxiv.org/pdf/2408.13988v1.pdf","comment":"42 pages and 9 figures"},{"id":"http://arxiv.org/abs/2401.12452v2","updated":"2024-08-26T02:50:28Z","published":"2024-01-23T02:41:06Z","title":"Self-supervised Learning of LiDAR 3D Point Clouds via 2D-3D Neural\n Calibration","summary":" This paper introduces a novel self-supervised learning framework for\nenhancing 3D perception in autonomous driving scenes. Specifically, our\napproach, namely NCLR, focuses on 2D-3D neural calibration, a novel pretext\ntask that estimates the rigid pose aligning camera and LiDAR coordinate\nsystems. First, we propose the learnable transformation alignment to bridge the\ndomain gap between image and point cloud data, converting features into a\nunified representation space for effective comparison and matching. Second, we\nidentify the overlapping area between the image and point cloud with the fused\nfeatures. Third, we establish dense 2D-3D correspondences to estimate the rigid\npose. The framework not only learns fine-grained matching from points to pixels\nbut also achieves alignment of the image and point cloud at a holistic level,\nunderstanding their relative pose. We demonstrate the efficacy of NCLR by\napplying the pre-trained backbone to downstream tasks, such as LiDAR-based 3D\nsemantic segmentation, object detection, and panoptic segmentation.\nComprehensive experiments on various datasets illustrate the superiority of\nNCLR over existing self-supervised methods. The results confirm that joint\nlearning from different modalities significantly enhances the network's\nunderstanding abilities and effectiveness of learned representation. The code\nis publicly available at https://github.com/Eaphan/NCLR.\n","authors":["Yifan Zhang","Siyu Ren","Junhui Hou","Jinjian Wu","Yixuan Yuan","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.12452v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.07895v7","updated":"2024-08-26T02:37:14Z","published":"2023-05-13T11:28:37Z","title":"OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models","summary":" Large models have recently played a dominant role in natural language\nprocessing and multimodal vision-language learning. However, their\neffectiveness in text-related visual tasks remains relatively unexplored. In\nthis paper, we conducted a comprehensive evaluation of Large Multimodal Models,\nsuch as GPT4V and Gemini, in various text-related visual tasks including Text\nRecognition, Scene Text-Centric Visual Question Answering (VQA),\nDocument-Oriented VQA, Key Information Extraction (KIE), and Handwritten\nMathematical Expression Recognition (HMER). To facilitate the assessment of\nOptical Character Recognition (OCR) capabilities in Large Multimodal Models, we\npropose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29\ndatasets, making it the most comprehensive OCR evaluation benchmark available.\nFurthermore, our study reveals both the strengths and weaknesses of these\nmodels, particularly in handling multilingual text, handwritten text,\nnon-semantic text, and mathematical expression recognition. Most importantly,\nthe baseline results presented in this study could provide a foundational\nframework for the conception and assessment of innovative strategies targeted\nat enhancing zero-shot multimodal techniques. 
The evaluation pipeline and\nbenchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.\n","authors":["Yuliang Liu","Zhang Li","Mingxin Huang","Biao Yang","Wenwen Yu","Chunyuan Li","Xucheng Yin","Cheng-lin Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2305.07895v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13983v1","updated":"2024-08-26T02:33:47Z","published":"2024-08-26T02:33:47Z","title":"Dual-Path Adversarial Lifting for Domain Shift Correction in Online\n Test-time Adaptation","summary":" Transformer-based methods have achieved remarkable success in various machine\nlearning tasks. How to design efficient test-time adaptation methods for\ntransformer models becomes an important research task. In this work, motivated\nby the dual-subband wavelet lifting scheme developed in multi-scale signal\nprocessing which is able to efficiently separate the input signals into\nprincipal components and noise components, we introduce a dual-path token\nlifting for domain shift correction in test time adaptation. Specifically, we\nintroduce an extra token, referred to as \\textit{domain shift token}, at each\nlayer of the transformer network. We then perform dual-path lifting with\ninterleaved token prediction and update between the path of domain shift tokens\nand the path of class tokens at all network layers. The prediction and update\nnetworks are learned in an adversarial manner. Specifically, the task of the\nprediction network is to learn the residual noise of domain shift which should\nbe largely invariant across all classes and all samples in the target domain.\nIn other words, the predicted domain shift noise should be indistinguishable\nbetween all sample classes. On the other hand, the task of the update network\nis to update the class tokens by removing the domain shift from the input image\nsamples so that input samples become more discriminative between different\nclasses in the feature space. To effectively learn the prediction and update\nnetworks with two adversarial tasks, both theoretically and practically, we\ndemonstrate that it is necessary to use smooth optimization for the update\nnetwork but non-smooth optimization for the prediction network. Experimental\nresults on the benchmark datasets demonstrate that our proposed method\nsignificantly improves the online fully test-time domain adaptation\nperformance. Code is available at \\url{https://github.com/yushuntang/DPAL}.\n","authors":["Yushun Tang","Shuoshuo Chen","Zhihe Lu","Xinchao Wang","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2408.13983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13981v1","updated":"2024-08-26T02:26:09Z","published":"2024-08-26T02:26:09Z","title":"ARANet: Attention-based Residual Adversarial Network with Deep\n Supervision for Radiotherapy Dose Prediction of Cervical Cancer","summary":" Radiation therapy is the mainstay treatment for cervical cancer, and its\nultimate goal is to ensure the planning target volume (PTV) reaches the\nprescribed dose while reducing dose deposition of organs-at-risk (OARs) as much\nas possible. To achieve these clinical requirements, the medical physicist\nneeds to manually tweak the radiotherapy plan repeatedly in a trial-anderror\nmanner until finding the optimal one in the clinic. However, such\ntrial-and-error processes are quite time-consuming, and the quality of plans\nhighly depends on the experience of the medical physicist. 
In this paper, we\npropose an end-to-end Attentionbased Residual Adversarial Network with deep\nsupervision, namely ARANet, to automatically predict the 3D dose distribution\nof cervical cancer. Specifically, given the computer tomography (CT) images and\ntheir corresponding segmentation masks of PTV and OARs, ARANet employs a\nprediction network to generate the dose maps. We also utilize a multi-scale\nresidual attention module and deep supervision mechanism to enforce the\nprediction network to extract more valuable dose features while suppressing\nirrelevant information. Our proposed method is validated on an in-house dataset\nincluding 54 cervical cancer patients, and experimental results have\ndemonstrated its obvious superiority compared to other state-of-the-art\nmethods.\n","authors":["Lu Wen","Wenxia Yin","Zhenghao Feng","Xi Wu","Deng Xiong","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.13981v1.pdf","comment":"Accepted by 2024 IEEE International Conference on Cybernetics and\n Intelligent Systems (CIS) and IEEE Conference on Robotics, Automation and\n Mechatronics (RAM)"},{"id":"http://arxiv.org/abs/2408.13980v1","updated":"2024-08-26T02:20:55Z","published":"2024-08-26T02:20:55Z","title":"FusionSAM: Latent Space driven Segment Anything Model for Multimodal\n Fusion and Segmentation","summary":" Multimodal image fusion and segmentation enhance scene understanding in\nautonomous driving by integrating data from various sensors. However, current\nmodels struggle to efficiently segment densely packed elements in such scenes,\ndue to the absence of comprehensive fusion features that can guide mid-process\nfine-tuning and focus attention on relevant areas. The Segment Anything Model\n(SAM) has emerged as a transformative segmentation method. It provides more\neffective prompts through its flexible prompt encoder, compared to transformers\nlacking fine-tuned control. Nevertheless, SAM has not been extensively studied\nin the domain of multimodal fusion for natural images. In this paper, we\nintroduce SAM into multimodal image segmentation for the first time, proposing\na novel framework that combines Latent Space Token Generation (LSTG) and Fusion\nMask Prompting (FMP) modules to enhance SAM's multimodal fusion and\nsegmentation capabilities. Specifically, we first obtain latent space features\nof the two modalities through vector quantization and embed them into a\ncross-attention-based inter-domain fusion module to establish long-range\ndependencies between modalities. Then, we use these comprehensive fusion\nfeatures as prompts to guide precise pixel-level segmentation. Extensive\nexperiments on several public datasets demonstrate that the proposed method\nsignificantly outperforms SAM and SAM2 in multimodal autonomous driving\nscenarios, achieving at least 3.9$\\%$ higher segmentation mIoU than the\nstate-of-the-art approaches.\n","authors":["Daixun Li","Weiying Xie","Mingxiang Cao","Yunke Wang","Jiaqing Zhang","Yunsong Li","Leyuan Fang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.13980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10650v2","updated":"2024-08-26T02:19:11Z","published":"2024-03-15T19:35:10Z","title":"PALM: Pushing Adaptive Learning Rate Mechanisms for Continual Test-Time\n Adaptation","summary":" Real-world vision models in dynamic environments face rapid shifts in domain\ndistributions, leading to decreased recognition performance. 
Using unlabeled\ntest data, continual test-time adaptation (CTTA) directly adjusts a pre-trained\nsource discriminative model to these changing domains. A highly effective CTTA\nmethod involves applying layer-wise adaptive learning rates for selectively\nadapting pre-trained layers. However, it suffers from the poor estimation of\ndomain shift and the inaccuracies arising from the pseudo-labels. This work\naims to overcome these limitations by identifying layers for adaptation via\nquantifying model prediction uncertainty without relying on pseudo-labels. We\nutilize the magnitude of gradients as a metric, calculated by backpropagating\nthe KL divergence between the softmax output and a uniform distribution, to\nselect layers for further adaptation. Subsequently, for the parameters\nexclusively belonging to these selected layers, with the remaining ones frozen,\nwe evaluate their sensitivity to approximate the domain shift and adjust their\nlearning rates accordingly. We conduct extensive image classification\nexperiments on CIFAR-10C, CIFAR-100C, and ImageNet-C, demonstrating the\nsuperior efficacy of our method compared to prior approaches.\n","authors":["Sarthak Kumar Maharana","Baoming Zhang","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2403.10650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04560v3","updated":"2024-08-26T02:11:58Z","published":"2024-01-09T13:56:37Z","title":"Phase-shifted remote photoplethysmography for estimating heart rate and\n blood pressure from facial video","summary":" Human health can be critically affected by cardiovascular diseases, such as\nhypertension, arrhythmias, and stroke. Heart rate and blood pressure are\nimportant biometric information for the monitoring of cardiovascular system and\nearly diagnosis of cardiovascular diseases. Existing methods for estimating the\nheart rate are based on electrocardiography and photoplethyomography, which\nrequire contacting the sensor to the skin surface. Moreover, catheter and\ncuff-based methods for measuring blood pressure cause inconvenience and have\nlimited applicability. Therefore, in this thesis, we propose a vision-based\nmethod for estimating the heart rate and blood pressure. This thesis proposes a\n2-stage deep learning framework consisting of a dual remote\nphotoplethysmography network (DRP-Net) and bounded blood pressure network\n(BBP-Net). In the first stage, DRP-Net infers remote photoplethysmography\n(rPPG) signals for the acral and facial regions, and these phase-shifted rPPG\nsignals are utilized to estimate the heart rate. In the second stage, BBP-Net\nintegrates temporal features and analyzes phase discrepancy between the acral\nand facial rPPG signals to estimate SBP and DBP values. To improve the accuracy\nof estimating the heart rate, we employed a data augmentation method based on a\nframe interpolation model. Moreover, we designed BBP-Net to infer blood\npressure within a predefined range by incorporating a scaled sigmoid function.\nOur method resulted in estimating the heart rate with the mean absolute error\n(MAE) of 1.78 BPM, reducing the MAE by 34.31 % compared to the recent method,\non the MMSE-HR dataset. The MAE for estimating the systolic blood pressure\n(SBP) and diastolic blood pressure (DBP) were 10.19 mmHg and 7.09 mmHg. 
On the\nV4V dataset, the MAE for the heart rate, SBP, and DBP were 3.83 BPM, 13.64\nmmHg, and 9.4 mmHg, respectively.\n","authors":["Gyutae Hwang","Sang Jun Lee"],"pdf_url":"https://arxiv.org/pdf/2401.04560v3.pdf","comment":"33 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.13979v1","updated":"2024-08-26T02:09:05Z","published":"2024-08-26T02:09:05Z","title":"Nemesis: Normalizing the Soft-prompt Vectors of Vision-Language Models","summary":" With the prevalence of large-scale pretrained vision-language models (VLMs),\nsuch as CLIP, soft-prompt tuning has become a popular method for adapting these\nmodels to various downstream tasks. However, few works delve into the inherent\nproperties of learnable soft-prompt vectors, specifically the impact of their\nnorms to the performance of VLMs. This motivates us to pose an unexplored\nresearch question: ``Do we need to normalize the soft prompts in VLMs?'' To\nfill this research gap, we first uncover a phenomenon, called the\n\\textbf{Low-Norm Effect} by performing extensive corruption experiments,\nsuggesting that reducing the norms of certain learned prompts occasionally\nenhances the performance of VLMs, while increasing them often degrades it. To\nharness this effect, we propose a novel method named \\textbf{N}ormalizing\nth\\textbf{e} soft-pro\\textbf{m}pt v\\textbf{e}ctors of vi\\textbf{si}on-language\nmodel\\textbf{s} (\\textbf{Nemesis}) to normalize soft-prompt vectors in VLMs. To\nthe best of our knowledge, our work is the first to systematically investigate\nthe role of norms of soft-prompt vector in VLMs, offering valuable insights for\nfuture research in soft-prompt tuning. The code is available at\n\\texttt{\\href{https://github.com/ShyFoo/Nemesis}{https://github.com/ShyFoo/Nemesis}}.\n","authors":["Shuai Fu","Xiequn Wang","Qiushi Huang","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13979v1.pdf","comment":"Accepted at ICLR 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2408.13978v1","updated":"2024-08-26T01:54:37Z","published":"2024-08-26T01:54:37Z","title":"Histology Virtual Staining with Mask-Guided Adversarial Transfer\n Learning for Tertiary Lymphoid Structure Detection","summary":" Histological Tertiary Lymphoid Structures (TLSs) are increasingly recognized\nfor their correlation with the efficacy of immunotherapy in various solid\ntumors. Traditionally, the identification and characterization of TLSs rely on\nimmunohistochemistry (IHC) staining techniques, utilizing markers such as CD20\nfor B cells. Despite the specificity of IHC, Hematoxylin-Eosin (H&E) staining\noffers a more accessible and cost-effective choice. Capitalizing on the\nprevalence of H&E staining slides, we introduce a novel Mask-Guided Adversarial\nTransfer Learning method designed for virtual pathological staining. This\nmethod adeptly captures the nuanced color variations across diverse tissue\ntypes under various staining conditions, such as nucleus, red blood cells,\npositive reaction regions, without explicit label information, and adeptly\nsynthesizes realistic IHC-like virtual staining patches, even replicating the\npositive reaction. Further, we propose the Virtual IHC Pathology Analysis\nNetwork (VIPA-Net), an integrated framework encompassing a Mask-Guided Transfer\nModule and an H&E-Based Virtual Staining TLS Detection Module. VIPA-Net\nsynergistically harnesses both H\\&E staining slides and the synthesized virtual\nIHC patches to enhance the detection of TLSs within H&E Whole Slide Images\n(WSIs). 
We evaluate the network with a comprehensive dataset comprising 1019\nannotated slides from The Cancer Genome Atlas (TCGA). Experimental results\ncompellingly illustrate that the VIPA-Net substantially elevates TLS detection\naccuracy, effectively circumventing the need for actual CD20 staining across\nthe public dataset.\n","authors":["Qiuli Wang","Yongxu Liu","Li Ma","Xianqi Wang","Wei Chen","Xiaohong Yao"],"pdf_url":"https://arxiv.org/pdf/2408.13978v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.13972v1","updated":"2024-08-26T01:36:46Z","published":"2024-08-26T01:36:46Z","title":"DynaSurfGS: Dynamic Surface Reconstruction with Planar-based Gaussian\n Splatting","summary":" Dynamic scene reconstruction has garnered significant attention in recent\nyears due to its capabilities in high-quality and real-time rendering. Among\nvarious methodologies, constructing a 4D spatial-temporal representation, such\nas 4D-GS, has gained popularity for its high-quality rendered images. However,\nthese methods often produce suboptimal surfaces, as the discrete 3D Gaussian\npoint clouds fail to align with the object's surface precisely. To address this\nproblem, we propose DynaSurfGS to achieve both photorealistic rendering and\nhigh-fidelity surface reconstruction of dynamic scenarios. Specifically, the\nDynaSurfGS framework first incorporates Gaussian features from 4D neural voxels\nwith the planar-based Gaussian Splatting to facilitate precise surface\nreconstruction. It leverages normal regularization to enforce the smoothness of\nthe surface of dynamic objects. It also incorporates the as-rigid-as-possible\n(ARAP) constraint to maintain the approximate rigidity of local neighborhoods\nof 3D Gaussians between timesteps and ensure that adjacent 3D Gaussians remain\nclosely aligned throughout. Extensive experiments demonstrate that DynaSurfGS\nsurpasses state-of-the-art methods in both high-fidelity surface reconstruction\nand photorealistic rendering.\n","authors":["Weiwei Cai","Weicai Ye","Peng Ye","Tong He","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.13972v1.pdf","comment":"homepage: https://open3dvlab.github.io/DynaSurfGS/, code:\n https://github.com/Open3DVLab/DynaSurfGS"},{"id":"http://arxiv.org/abs/2408.09403v2","updated":"2024-08-26T01:26:35Z","published":"2024-08-18T08:23:51Z","title":"Obtaining Optimal Spiking Neural Network in Sequence Learning via\n CRNN-SNN Conversion","summary":" Spiking neural networks (SNNs) are becoming a promising alternative to\nconventional artificial neural networks (ANNs) due to their rich neural\ndynamics and the implementation of energy-efficient neuromorphic chips.\nHowever, the non-differential binary communication mechanism makes SNN hard to\nconverge to an ANN-level accuracy. When SNN encounters sequence learning, the\nsituation becomes worse due to the difficulties in modeling long-range\ndependencies. To overcome these difficulties, researchers developed variants of\nLIF neurons and different surrogate gradients but still failed to obtain good\nresults when the sequence became longer (e.g., $>$500). Unlike them, we obtain\nan optimal SNN in sequence learning by directly mapping parameters from a\nquantized CRNN. We design two sub-pipelines to support the end-to-end\nconversion of different structures in neural networks, which is called\nCNN-Morph (CNN $\\rightarrow$ QCNN $\\rightarrow$ BIFSNN) and RNN-Morph (RNN\n$\\rightarrow$ QRNN $\\rightarrow$ RBIFSNN). 
Using conversion pipelines and the\ns-analog encoding method, the conversion error of our framework is zero.\nFurthermore, we give the theoretical and experimental demonstration of the\nlossless CRNN-SNN conversion. Our results show the effectiveness of our method\nover short and long timescales tasks compared with the state-of-the-art\nlearning- and conversion-based methods. We reach the highest accuracy of 99.16%\n(0.46 $\\uparrow$) on S-MNIST, 94.95% (3.95 $\\uparrow$) on PS-MNIST (sequence\nlength of 784) respectively, and the lowest loss of 0.057 (0.013 $\\downarrow$)\nwithin 8 time-steps in collision avoidance dataset.\n","authors":["Jiahao Su","Kang You","Zekai Xu","Weizhi Xu","Zhezhi He"],"pdf_url":"https://arxiv.org/pdf/2408.09403v2.pdf","comment":"Accepted by 33rd International Conference on Artificial Neural\n Networks"},{"id":"http://arxiv.org/abs/2408.12677v2","updated":"2024-08-26T01:08:36Z","published":"2024-08-22T18:32:50Z","title":"GSFusion: Online RGB-D Mapping Where Gaussian Splatting Meets TSDF\n Fusion","summary":" Traditional volumetric fusion algorithms preserve the spatial structure of 3D\nscenes, which is beneficial for many tasks in computer vision and robotics.\nHowever, they often lack realism in terms of visualization. Emerging 3D\nGaussian splatting bridges this gap, but existing Gaussian-based reconstruction\nmethods often suffer from artifacts and inconsistencies with the underlying 3D\nstructure, and struggle with real-time optimization, unable to provide users\nwith immediate feedback in high quality. One of the bottlenecks arises from the\nmassive amount of Gaussian parameters that need to be updated during\noptimization. Instead of using 3D Gaussian as a standalone map representation,\nwe incorporate it into a volumetric mapping system to take advantage of\ngeometric information and propose to use a quadtree data structure on images to\ndrastically reduce the number of splats initialized. In this way, we\nsimultaneously generate a compact 3D Gaussian map with fewer artifacts and a\nvolumetric map on the fly. Our method, GSFusion, significantly enhances\ncomputational efficiency without sacrificing rendering quality, as demonstrated\non both synthetic and real datasets. Code will be available at\nhttps://github.com/goldoak/GSFusion.\n","authors":["Jiaxin Wei","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2408.12677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14698v1","updated":"2024-08-26T23:52:27Z","published":"2024-08-26T23:52:27Z","title":"Smart Multi-Modal Search: Contextual Sparse and Dense Embedding\n Integration in Adobe Express","summary":" As user content and queries become increasingly multi-modal, the need for\neffective multi-modal search systems has grown. Traditional search systems\noften rely on textual and metadata annotations for indexed images, while\nmulti-modal embeddings like CLIP enable direct search using text and image\nembeddings. However, embedding-based approaches face challenges in integrating\ncontextual features such as user locale and recency. Building a scalable\nmulti-modal search system requires fine-tuning several components. This paper\npresents a multi-modal search architecture and a series of AB tests that\noptimize embeddings and multi-modal technologies in Adobe Express template\nsearch. We address considerations such as embedding model selection, the roles\nof embeddings in matching and ranking, and the balance between dense and sparse\nembeddings. 
Our iterative approach demonstrates how utilizing sparse, dense,\nand contextual features enhances short and long query search, significantly\nreduces null rates (over 70\\%), and increases click-through rates (CTR). Our\nfindings provide insights into developing robust multi-modal search systems,\nthereby enhancing relevance for complex queries.\n","authors":["Cherag Aroraa","Tracy Holloway King","Jayant Kumar","Yi Lu","Sanat Sharma","Arvind Srikantan","David Uvalle","Josep Valls-Vargas","Harsha Vardhan"],"pdf_url":"https://arxiv.org/pdf/2408.14698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14681v1","updated":"2024-08-26T23:10:42Z","published":"2024-08-26T23:10:42Z","title":"Enhancing Neural Network Interpretability Through Conductance-Based\n Information Plane Analysis","summary":" The Information Plane is a conceptual framework used to analyze the flow of\ninformation in neural networks, but traditional methods based on activations\nmay not fully capture the dynamics of information processing. This paper\nintroduces a new approach that uses layer conductance, a measure of sensitivity\nto input features, to enhance the Information Plane analysis. By incorporating\ngradient-based contributions, we provide a more precise characterization of\ninformation dynamics within the network. The proposed conductance-based\nInformation Plane and a new Information Transformation Efficiency (ITE) metric\nare evaluated on pretrained ResNet50 and VGG16 models using the ImageNet\ndataset. Our results demonstrate the ability to identify critical hidden layers\nthat contribute significantly to model performance and interpretability, giving\ninsights into information compression, preservation, and utilization across\nlayers. The conductance-based approach offers a granular perspective on feature\nattribution, enhancing our understanding of the decision-making processes\nwithin neural networks. Furthermore, our empirical findings challenge certain\ntheoretical predictions of the Information Bottleneck theory, highlighting the\ncomplexities of information dynamics in real-world data scenarios. The proposed\nmethod not only advances our understanding of information dynamics in neural\nnetworks but also has the potential to significantly impact the broader field\nof Artificial Intelligence by enabling the development of more interpretable,\nefficient, and robust models.\n","authors":["Jaouad Dabounou","Amine Baazzouz"],"pdf_url":"https://arxiv.org/pdf/2408.14681v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.02534v2","updated":"2024-08-26T22:56:28Z","published":"2024-07-01T16:58:55Z","title":"Image-to-Text Logic Jailbreak: Your Imagination can Help You Do Anything","summary":" Large Visual Language Models (VLMs) such as GPT-4V have achieved\nremarkable success in generating comprehensive and nuanced responses.\nResearchers have proposed various benchmarks for evaluating the capabilities of\nVLMs. With the integration of visual and text inputs in VLMs, new security\nissues emerge, as malicious attackers can exploit multiple modalities to\nachieve their objectives. This has led to increasing attention on the\nvulnerabilities of VLMs to jailbreak. Most existing research focuses on\ngenerating adversarial images or nonsensical images to jailbreak these models.\nHowever, no researchers have evaluated whether the logic understanding capabilities of\nVLMs in flowcharts can influence jailbreak. 
Therefore, to fill this gap, this\npaper first introduces a novel dataset Flow-JD specifically designed to\nevaluate the logic-based flowchart jailbreak capabilities of VLMs. We conduct\nan extensive evaluation on GPT-4o, GPT-4V, and 5 other SOTA open-source VLMs, and\nthe jailbreak rate is up to 92.8%. Our research reveals significant\nvulnerabilities in current VLMs concerning image-to-text jailbreak and these\nfindings underscore the urgency for the development of robust and effective\nfuture defenses.\n","authors":["Xiaotian Zou","Ke Li","Yongkang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14674v1","updated":"2024-08-26T22:50:59Z","published":"2024-08-26T22:50:59Z","title":"gWaveNet: Classification of Gravity Waves from Noisy Satellite Data\n using Custom Kernel Integrated Deep Learning Method","summary":" Atmospheric gravity waves occur in the Earth's atmosphere, caused by an\ninterplay between gravity and buoyancy forces. These waves have profound\nimpacts on various aspects of the atmosphere, including the patterns of\nprecipitation, cloud formation, ozone distribution, aerosols, and pollutant\ndispersion. Therefore, understanding gravity waves is essential to comprehend\nand monitor changes in a wide range of atmospheric behaviors. Limited studies\nhave been conducted to identify gravity waves from satellite data using machine\nlearning techniques. Particularly, without applying noise removal techniques,\nit remains an underexplored area of research. This study presents a novel\nkernel design aimed at identifying gravity waves within satellite images. The\nproposed kernel is seamlessly integrated into a deep convolutional neural\nnetwork, denoted as gWaveNet. Our proposed model exhibits impressive\nproficiency in detecting images containing gravity waves from noisy satellite\ndata without any feature engineering. The empirical results show our model\noutperforms related approaches by achieving over 98% training accuracy and over\n94% test accuracy, which is known to be the best result for gravity wave\ndetection up to the time of this work. We open-sourced our code at\nhttps://rb.gy/qn68ku.\n","authors":["Seraj Al Mahmud Mostafa","Omar Faruque","Chenxi Wang","Jia Yue","Sanjay Purushotham","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14674v1.pdf","comment":"This paper has been accepted at the 27th International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2408.14672v1","updated":"2024-08-26T22:39:08Z","published":"2024-08-26T22:39:08Z","title":"Physically Feasible Semantic Segmentation","summary":" State-of-the-art semantic segmentation models are typically optimized in a\ndata-driven fashion, minimizing solely per-pixel classification objectives on\ntheir training data. This purely data-driven paradigm often leads to absurd\nsegmentations, especially when the domain of input images is shifted from the\none encountered during training. For instance, state-of-the-art models may\nassign the label ``road'' to a segment which is located above a segment that is\nrespectively labeled as ``sky'', although our knowledge of the physical world\ndictates that such a configuration is not feasible for images captured by\nforward-facing upright cameras. 
Our method, Physically Feasible Semantic\nSegmentation (PhyFea), extracts explicit physical constraints that govern\nspatial class relations from the training sets of semantic segmentation\ndatasets and enforces a differentiable loss function that penalizes violations\nof these constraints to promote prediction feasibility. PhyFea yields\nsignificant performance improvements in mIoU over each state-of-the-art network\nwe use as a baseline across ADE20K, Cityscapes and ACDC, notably a $1.5\\%$\nimprovement on ADE20K and a $2.1\\%$ improvement on ACDC.\n","authors":["Shamik Basu","Christos Sakaridis","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2408.14672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14659v1","updated":"2024-08-26T21:55:06Z","published":"2024-08-26T21:55:06Z","title":"Comparative Analysis: Violence Recognition from Videos using Transfer\n Learning","summary":" Action recognition has become a hot topic in computer vision. However, the\nmain applications of computer vision in video processing have focused on\ndetection of relatively simple actions while complex events such as violence\ndetection have been comparatively less investigated. This study focuses on the\nbenchmarking of various deep learning techniques on a complex dataset. Next, a\nlarger dataset is utilized to test the uplift from increasing volume of data.\nThe dataset size increase from 500 to 1,600 videos resulted in a notable\naverage accuracy improvement of 6% across four models.\n","authors":["Dursun Dashdamirov"],"pdf_url":"https://arxiv.org/pdf/2408.14659v1.pdf","comment":"6 pages, 5 figures, The paper will be published in IEEE AICT 2024\n Conference"},{"id":"http://arxiv.org/abs/2308.13651v5","updated":"2024-08-26T21:11:26Z","published":"2023-08-25T19:40:56Z","title":"PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained\n Image Classification Accuracy for AIs and Humans","summary":" Nearest neighbors (NN) are traditionally used to compute final decisions,\ne.g., in Support Vector Machines or k-NN classifiers, and to provide users with\nexplanations for the model's decision. In this paper, we show a novel utility\nof nearest neighbors: To improve predictions of a frozen, pretrained image\nclassifier C. We leverage an image comparator S that (1) compares the input\nimage with NN images from the top-K most probable classes given by C; and (2)\nuses scores from S to weight the confidence scores of C to refine predictions.\nOur method consistently improves fine-grained image classification accuracy on\nCUB-200, Cars-196, and Dogs-120. Also, a human study finds that showing users\nour probable-class nearest neighbors (PCNN) reduces over-reliance on AI, thus\nimproving their decision accuracy over prior work which shows only the\nmost-probable (top-1) class examples.\n","authors":[" Giang"," Nguyen","Valerie Chen","Mohammad Reza Taesiri","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.13651v5.pdf","comment":"Accepted to Transaction on Machine Learning Research 2024; 50 pages,\n 35 Figures & 17 Tables"},{"id":"http://arxiv.org/abs/2408.11836v3","updated":"2024-08-26T20:31:08Z","published":"2024-08-06T22:09:50Z","title":"Analysis of Unstructured High-Density Crowded Scenes for Crowd\n Monitoring","summary":" We are interested in developing an automated system for detection of\norganized movements in human crowds. 
Computer vision algorithms can extract\ninformation from videos of crowded scenes and automatically detect and track\ngroups of individuals undergoing organized motion that represents an anomalous\nbehavior in the context of conflict aversion. Our system can detect organized\ncohorts against the background of randomly moving objects and we can estimate\nthe number of participants in an organized cohort, the speed and direction of\nmotion in real time, within three to four video frames, which is less than one\nsecond from the onset of motion captured on a CCTV. We have performed\npreliminary analysis in this context in biological cell data containing up to\nfour thousand objects per frame and will extend this numerically to a\nhundred-fold for public safety applications.\n We envisage using the existing infrastructure of video cameras for acquiring\nimage datasets on-the-fly and deploying an easy-to-use data-driven software\nsystem for parsing of significant events by analyzing image sequences taken\ninside and outside of sports stadiums or other public venues. Other prospective\nusers are organizers of political rallies, civic and wildlife organizations,\nsecurity firms, and the military. We will optimize the performance of the\nsoftware by implementing a classification method able to distinguish between\nactivities posing a threat and those not posing a threat.\n","authors":["Alexandre Matov"],"pdf_url":"https://arxiv.org/pdf/2408.11836v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06494v2","updated":"2024-08-26T20:10:52Z","published":"2024-08-12T21:04:16Z","title":"What Color Scheme is More Effective in Assisting Readers to Locate\n Information in a Color-Coded Article?","summary":" Color coding, a technique assigning specific colors to cluster information\ntypes, has proven advantages in aiding human cognitive activities, especially\nreading and comprehension. The rise of Large Language Models (LLMs) has\nstreamlined document coding, enabling simple automatic text labeling with\nvarious schemes. This has the potential to make color-coding more accessible\nand benefit more users. However, the impact of color choice on information\nseeking is understudied. We conducted a user study assessing various color\nschemes' effectiveness in LLM-coded text documents, standardizing contrast\nratios to approximately 5.55:1 across schemes. Participants performed timed\ninformation-seeking tasks in color-coded scholarly abstracts. Results showed\nnon-analogous and yellow-inclusive color schemes improved performance, with the\nlatter also being more preferred by participants. These findings can inform\nbetter color scheme choices for text annotation. 
As LLMs advance document\ncoding, we advocate for more research focusing on the \"color\" aspect of\ncolor-coding techniques.\n","authors":["Ho Yin Ng","Zeyu He","Ting-Hao 'Kenneth' Huang"],"pdf_url":"https://arxiv.org/pdf/2408.06494v2.pdf","comment":"This paper will appear at IEEE VIS 2024"},{"id":"http://arxiv.org/abs/2408.14606v1","updated":"2024-08-26T19:59:20Z","published":"2024-08-26T19:59:20Z","title":"BreakNet: Discontinuity-Resilient Multi-Scale Transformer Segmentation\n of Retinal Layers","summary":" Visible light optical coherence tomography (vis-OCT) is gaining traction for\nretinal imaging due to its high resolution and functional capabilities.\nHowever, the significant absorption of hemoglobin in the visible light range\nleads to pronounced shadow artifacts from retinal blood vessels, posing\nchallenges for accurate layer segmentation. In this study, we present BreakNet,\na multi-scale Transformer-based segmentation model designed to address boundary\ndiscontinuities caused by these shadow artifacts. BreakNet utilizes\nhierarchical Transformer and convolutional blocks to extract multi-scale global\nand local feature maps, capturing essential contextual, textural, and edge\ncharacteristics. The model incorporates decoder blocks that expand pathways\nto enhance the extraction of fine details and semantic information, ensuring\nprecise segmentation. Evaluated on rodent retinal images acquired with\nprototype vis-OCT, BreakNet demonstrated superior performance over\nstate-of-the-art segmentation models, such as TCCT-BP and U-Net, even when\nfaced with limited-quality ground truth data. Our findings indicate that\nBreakNet has the potential to significantly improve retinal quantification and\nanalysis.\n","authors":["Razieh Ganjee","Bingjie Wang","Lingyun Wang","Chengcheng Zhao","José-Alain Sahel","Shaohua Pi"],"pdf_url":"https://arxiv.org/pdf/2408.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14601v1","updated":"2024-08-26T19:44:18Z","published":"2024-08-26T19:44:18Z","title":"3D Point Cloud Network Pruning: When Some Weights Do not Matter","summary":" A point cloud is a crucial geometric data structure utilized in numerous\napplications. The adoption of deep neural networks referred to as Point Cloud\nNeural Networks (PCNNs), for processing 3D point clouds, has significantly\nadvanced fields that rely on 3D geometric data to enhance the efficiency of\ntasks. Expanding the size of both neural network models and 3D point clouds\nintroduces significant challenges in minimizing computational and memory\nrequirements. This is essential for meeting the demanding requirements of\nreal-world applications, which prioritize minimal energy consumption and low\nlatency. Therefore, investigating redundancy in PCNNs is crucial yet\nchallenging due to their sensitivity to parameters. Additionally, traditional\npruning methods face difficulties as these networks rely heavily on weights and\npoints. Nonetheless, our research reveals a promising phenomenon that could\nrefine standard PCNN pruning techniques. Our findings suggest that preserving\nonly the top p% of the highest magnitude weights is crucial for accuracy\npreservation. For example, pruning 99% of the weights from the PointNet model\nstill results in accuracy close to the base level. Specifically, in the\nModelNet40 dataset, where the base accuracy with the PointNet model was 87.5%,\npreserving only 1% of the weights still achieves an accuracy of 86.8%. 
Codes\nare available in: https://github.com/apurba-nsu-rnd-lab/PCNN_Pruning\n","authors":["Amrijit Biswas","Md. Ismail Hossain","M M Lutfe Elahi","Ali Cheraghian","Fuad Rahman","Nabeel Mohammed","Shafin Rahman"],"pdf_url":"https://arxiv.org/pdf/2408.14601v1.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2408.14600v1","updated":"2024-08-26T19:43:01Z","published":"2024-08-26T19:43:01Z","title":"PVAFN: Point-Voxel Attention Fusion Network with Multi-Pooling Enhancing\n for 3D Object Detection","summary":" The integration of point and voxel representations is becoming more common in\nLiDAR-based 3D object detection. However, this combination often struggles with\ncapturing semantic information effectively. Moreover, relying solely on point\nfeatures within regions of interest can lead to information loss and\nlimitations in local feature representation. To tackle these challenges, we\npropose a novel two-stage 3D object detector, called Point-Voxel Attention\nFusion Network (PVAFN). PVAFN leverages an attention mechanism to improve\nmulti-modal feature fusion during the feature extraction phase. In the\nrefinement stage, it utilizes a multi-pooling strategy to integrate both\nmulti-scale and region-specific information effectively. The point-voxel\nattention mechanism adaptively combines point cloud and voxel-based\nBird's-Eye-View (BEV) features, resulting in richer object representations that\nhelp to reduce false detections. Additionally, a multi-pooling enhancement\nmodule is introduced to boost the model's perception capabilities. This module\nemploys cluster pooling and pyramid pooling techniques to efficiently capture\nkey geometric details and fine-grained shape structures, thereby enhancing the\nintegration of local and global features. Extensive experiments on the KITTI\nand Waymo datasets demonstrate that the proposed PVAFN achieves competitive\nperformance. The code and models will be available.\n","authors":["Yidi Li","Jiahao Wen","Bin Ren","Wenhao Li","Zhenhuan Xu","Hao Guo","Hong Liu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.14600v1.pdf","comment":"3D Object Detection"},{"id":"http://arxiv.org/abs/2404.00491v2","updated":"2024-08-26T19:39:19Z","published":"2024-03-30T23:19:40Z","title":"Denoising Monte Carlo Renders with Diffusion Models","summary":" Physically-based renderings contain Monte-Carlo noise, with variance that\nincreases as the number of rays per pixel decreases. This noise, while\nzero-mean for good modern renderers, can have heavy tails (most notably, for\nscenes containing specular or refractive objects). Learned methods for\nrestoring low fidelity renders are highly developed, because suppressing render\nnoise means one can save compute and use fast renders with few rays per pixel.\nWe demonstrate that a diffusion model can denoise low fidelity renders\nsuccessfully. Furthermore, our method can be conditioned on a variety of\nnatural render information, and this conditioning helps performance.\nQuantitative experiments show that our method is competitive with SOTA across a\nrange of sampling rates. 
Qualitative examination of the reconstructions\nsuggests that the image prior applied by a diffusion method strongly favors\nreconstructions that are like real images -- so have straight shadow\nboundaries, curved specularities and no fireflies.\n","authors":["Vaibhav Vavilala","Rahul Vasanth","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2404.00491v2.pdf","comment":"25 pages, 18 figures, 2 tables"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.14432v1","updated":"2024-08-26T17:20:34Z","published":"2024-08-26T17:20:34Z","title":"Contextual Bandit with Herding Effects: Algorithms and Recommendation\n Applications","summary":" Contextual bandits serve as a fundamental algorithmic framework for\noptimizing recommendation decisions online. Though extensive attention has been\npaid to tailoring contextual bandits for recommendation applications, the\n\"herding effects\" in user feedback have been ignored. These herding effects\nbias user feedback toward historical ratings, breaking down the assumption of\nunbiased feedback inherent in contextual bandits. This paper develops a novel\nvariant of the contextual bandit that is tailored to address the feedback bias\ncaused by the herding effects. A user feedback model is formulated to capture\nthis feedback bias. We design the TS-Conf (Thompson Sampling under Conformity)\nalgorithm, which employs posterior sampling to balance the exploration and\nexploitation tradeoff. We prove an upper bound for the regret of the algorithm,\nrevealing the impact of herding effects on learning speed. Extensive\nexperiments on datasets demonstrate that TS-Conf outperforms four benchmark\nalgorithms. Analysis reveals that TS-Conf effectively mitigates the negative\nimpact of herding effects, resulting in faster learning and improved\nrecommendation accuracy.\n","authors":["Luyue Xu","Liming Wang","Hong Xie","Mingqiang Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.14432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14393v1","updated":"2024-08-26T16:21:50Z","published":"2024-08-26T16:21:50Z","title":"CURE4Rec: A Benchmark for Recommendation Unlearning with Deeper\n Influence","summary":" With increasing privacy concerns in artificial intelligence, regulations have\nmandated the right to be forgotten, granting individuals the right to withdraw\ntheir data from models. Machine unlearning has emerged as a potential solution\nto enable selective forgetting in models, particularly in recommender systems\nwhere historical data contains sensitive user information. Despite recent\nadvances in recommendation unlearning, evaluating unlearning methods\ncomprehensively remains challenging due to the absence of a unified evaluation\nframework and overlooked aspects of deeper influence, e.g., fairness. To\naddress these gaps, we propose CURE4Rec, the first comprehensive benchmark for\nrecommendation unlearning evaluation. CURE4Rec covers four aspects, i.e.,\nunlearning Completeness, recommendation Utility, unleaRning efficiency, and\nrecommendation fairnEss, under three data selection strategies, i.e., core\ndata, edge data, and random data. Specifically, we consider the deeper\ninfluence of unlearning on recommendation fairness and robustness towards data\nwith varying impact levels. We construct multiple datasets with CURE4Rec\nevaluation and conduct extensive experiments on existing recommendation\nunlearning methods. 
Our code is released at\nhttps://github.com/xiye7lai/CURE4Rec.\n","authors":["Chaochao Chen","Jiaming Zhang","Yizhao Zhang","Li Zhang","Lingjuan Lyu","Yuyuan Li","Biao Gong","Chenggang Yan"],"pdf_url":"https://arxiv.org/pdf/2408.14393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14238v1","updated":"2024-08-26T12:52:02Z","published":"2024-08-26T12:52:02Z","title":"Are LLM-based Recommenders Already the Best? Simple Scaled Cross-entropy\n Unleashes the Potential of Traditional Sequential Recommenders","summary":" Large language models (LLMs) have been garnering increasing attention in the\nrecommendation community. Some studies have observed that LLMs, when fine-tuned\nby the cross-entropy (CE) loss with a full softmax, could achieve\n`state-of-the-art' performance in sequential recommendation. However, most of\nthe baselines used for comparison are trained using a pointwise/pairwise loss\nfunction. This inconsistent experimental setting leads to the underestimation\nof traditional methods and further fosters over-confidence in the ranking\ncapability of LLMs.\n In this study, we provide theoretical justification for the superiority of\nthe cross-entropy loss by demonstrating its two desirable properties: tightness\nand coverage. Furthermore, this study sheds light on additional novel insights:\n1) Taking into account only the recommendation performance, CE is not yet\noptimal as it is not a quite tight bound in terms of some ranking metrics. 2)\nIn scenarios that full softmax cannot be performed, an effective alternative is\nto scale up the sampled normalizing term. These findings then help unleash the\npotential of traditional recommendation models, allowing them to surpass\nLLM-based counterparts. Given the substantial computational burden, existing\nLLM-based methods are not as effective as claimed for sequential\nrecommendation. We hope that these theoretical understandings in conjunction\nwith the empirical results will facilitate an objective evaluation of LLM-based\nrecommendation in the future.\n","authors":["Cong Xu","Zhangchi Zhu","Mo Yu","Jun Wang","Jianyong Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14238v1.pdf","comment":"18 pages. arXiv admin note: substantial text overlap with\n arXiv:2402.06216"},{"id":"http://arxiv.org/abs/2408.05141v2","updated":"2024-08-26T10:53:28Z","published":"2024-08-09T15:53:55Z","title":"A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning","summary":" Retrieval-augmented generation (RAG) is a framework enabling large language\nmodels (LLMs) to enhance their accuracy and reduce hallucinations by\nintegrating external knowledge bases. In this paper, we introduce a hybrid RAG\nsystem enhanced through a comprehensive suite of optimizations that\nsignificantly improve retrieval quality, augment reasoning capabilities, and\nrefine numerical computation ability. We refined the text chunks and tables in\nweb pages, added attribute predictors to reduce hallucinations, conducted LLM\nKnowledge Extractor and Knowledge Graph Extractor, and finally built a\nreasoning strategy with all the references. We evaluated our system on the CRAG\ndataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and\nonline evaluations demonstrate that our system significantly enhances complex\nreasoning capabilities. In local evaluations, we have significantly improved\naccuracy and reduced error rates compared to the baseline model, achieving a\nnotable increase in scores. 
In the meanwhile, we have attained outstanding\nresults in online assessments, demonstrating the performance and generalization\ncapabilities of the proposed system. The source code for our system is released\nin \\url{https://gitlab.aicrowd.com/shizueyy/crag-new}.\n","authors":["Ye Yuan","Chengwu Liu","Jingyang Yuan","Gongbo Sun","Siqi Li","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05141v2.pdf","comment":"Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024"},{"id":"http://arxiv.org/abs/2408.14118v1","updated":"2024-08-26T09:06:35Z","published":"2024-08-26T09:06:35Z","title":"Towards Lifelong Learning Embeddings: An Algorithmic Approach to\n Dynamically Extend Embeddings","summary":" The rapid evolution of technology has transformed business operations and\ncustomer interactions worldwide, with personalization emerging as a key\nopportunity for e-commerce companies to engage customers more effectively. The\napplication of machine learning, particularly that of deep learning models, has\ngained significant traction due to its ability to rapidly recognize patterns in\nlarge datasets, thereby offering numerous possibilities for personalization.\nThese models use embeddings to map discrete information, such as product IDs,\ninto a latent vector space, a method increasingly popular in recent years.\nHowever, e-commerce's dynamic nature, characterized by frequent new product\nintroductions, poses challenges for these embeddings, which typically require\nfixed dimensions and inputs, leading to the need for periodic retraining from\nscratch. This paper introduces a modular algorithm that extends embedding input\nsize while preserving learned knowledge, addressing the challenges posed by\ne-commerce's dynamism. The proposed algorithm also incorporates strategies to\nmitigate the cold start problem associated with new products. The results of\ninitial experiments suggest that this method outperforms traditional\nembeddings.\n","authors":["Miguel Alves Gomes","Philipp Meisen","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2408.14118v1.pdf","comment":"Accepted Extended Abstract for 3rd Workshop on End-End Customer\n Journey Optimization at KDD2024, Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.08713v2","updated":"2024-08-26T03:03:47Z","published":"2024-08-16T12:51:52Z","title":"Beyond KAN: Introducing KarSein for Adaptive High-Order Feature\n Interaction Modeling in CTR Prediction","summary":" Modeling feature interactions is crucial for click-through rate (CTR)\nprediction, particularly when it comes to high-order explicit interactions.\nTraditional methods struggle with this task because they often predefine a\nmaximum interaction order, which relies heavily on prior knowledge and can\nlimit the model's effectiveness. Additionally, modeling high-order interactions\ntypically leads to increased computational costs. Therefore, the challenge lies\nin adaptively modeling high-order feature interactions while maintaining\nefficiency. To address this issue, we introduce Kolmogorov-Arnold Represented\nSparse Efficient Interaction Network (KarSein), designed to optimize both\npredictive accuracy and computational efficiency. We firstly identify\nlimitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and\nthen introduce KarSein to overcome these issues. It features a novel\narchitecture that reduces the computational costs of KAN and supports embedding\nvectors as feature inputs. 
Additionally, KarSein employs guided symbolic\nregression to address the challenge of KAN in spontaneously learning\nmultiplicative relationships. Extensive experiments demonstrate KarSein's\nsuperior performance, achieving significant predictive accuracy with minimal\ncomputational overhead. Furthermore, KarSein maintains strong global\nexplainability while enabling the removal of redundant features, resulting in a\nsparse network structure. These advantages also position KarSein as a promising\nmethod for efficient inference.\n","authors":["Yunxiao Shi","Wujiang Xu","Mingyu Jin","Haimin Zhang","Qiang Wu","Yongfeng Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08713v2.pdf","comment":"KarSein for CTR"},{"id":"http://arxiv.org/abs/2408.13986v1","updated":"2024-08-26T02:36:55Z","published":"2024-08-26T02:36:55Z","title":"AgentMove: Predicting Human Mobility Anywhere Using Large Language Model\n based Agentic Framework","summary":" Human mobility prediction plays a crucial role in various real-world\napplications. Although deep learning based models have shown promising results\nover the past decade, their reliance on extensive private mobility data for\ntraining and their inability to perform zero-shot predictions, have hindered\nfurther advancements. Recently, attempts have been made to apply large language\nmodels (LLMs) to mobility prediction task. However, their performance has been\nconstrained by the absence of a systematic design of workflow. They directly\ngenerate the final output using LLMs, which limits the potential of LLMs to\nuncover complex mobility patterns and underestimates their extensive reserve of\nglobal geospatial knowledge. In this paper, we introduce AgentMove, a\nsystematic agentic prediction framework to achieve generalized mobility\nprediction for any cities worldwide. In AgentMove, we first decompose the\nmobility prediction task into three sub-tasks and then design corresponding\nmodules to complete these subtasks, including spatial-temporal memory for\nindividual mobility pattern mining, world knowledge generator for modeling the\neffects of urban structure and collective knowledge extractor for capturing the\nshared patterns among population. Finally, we combine the results of three\nmodules and conduct a reasoning step to generate the final predictions.\nExtensive experiments on mobility data from two sources in 12 cities\ndemonstrate that AgentMove outperforms the best baseline more than 8% in\nvarious metrics and it shows robust predictions with various LLMs as base and\nalso less geographical bias across cities. Codes and data can be found in\nhttps://github.com/tsinghua-fib-lab/AgentMove.\n","authors":["Jie Feng","Yuwei Du","Jie Zhao","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2408.13986v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2406.18747v2","updated":"2024-08-26T01:07:11Z","published":"2024-06-26T20:25:53Z","title":"A Stem-Agnostic Single-Decoder System for Music Source Separation Beyond\n Four Stems","summary":" Despite significant recent progress across multiple subtasks of audio source\nseparation, few music source separation systems support separation beyond the\nfour-stem vocals, drums, bass, and other (VDBO) setup. Of the very few current\nsystems that support source separation beyond this setup, most continue to rely\non an inflexible decoder setup that can only support a fixed pre-defined set of\nstems. 
Increasing stem support in these inflexible systems correspondingly\nrequires increasing computational complexity, rendering extensions of these\nsystems computationally infeasible for long-tail instruments. In this work, we\npropose Banquet, a system that allows source separation of multiple stems using\njust one decoder. A bandsplit source separation model is extended to work in a\nquery-based setup in tandem with a music instrument recognition PaSST model. On\nthe MoisesDB dataset, Banquet, at only 24.9 M trainable parameters, approached\nthe performance level of the significantly more complex 6-stem Hybrid\nTransformer Demucs on VDBO stems and outperformed it on guitar and piano. The\nquery-based setup allows for the separation of narrow instrument classes such\nas clean acoustic guitars, and can be successfully applied to the extraction of\nless common stems such as reeds and organs. Implementation is available at\nhttps://github.com/kwatcharasupat/query-bandit.\n","authors":["Karn N. Watcharasupat","Alexander Lerch"],"pdf_url":"https://arxiv.org/pdf/2406.18747v2.pdf","comment":"Accepted to the 25th International Society for Music Information\n Retrieval Conference (ISMIR 2024). Camera-ready version"},{"id":"http://arxiv.org/abs/2408.14698v1","updated":"2024-08-26T23:52:27Z","published":"2024-08-26T23:52:27Z","title":"Smart Multi-Modal Search: Contextual Sparse and Dense Embedding\n Integration in Adobe Express","summary":" As user content and queries become increasingly multi-modal, the need for\neffective multi-modal search systems has grown. Traditional search systems\noften rely on textual and metadata annotations for indexed images, while\nmulti-modal embeddings like CLIP enable direct search using text and image\nembeddings. However, embedding-based approaches face challenges in integrating\ncontextual features such as user locale and recency. Building a scalable\nmulti-modal search system requires fine-tuning several components. This paper\npresents a multi-modal search architecture and a series of AB tests that\noptimize embeddings and multi-modal technologies in Adobe Express template\nsearch. We address considerations such as embedding model selection, the roles\nof embeddings in matching and ranking, and the balance between dense and sparse\nembeddings. Our iterative approach demonstrates how utilizing sparse, dense,\nand contextual features enhances short and long query search, significantly\nreduces null rates (over 70\\%), and increases click-through rates (CTR). Our\nfindings provide insights into developing robust multi-modal search systems,\nthereby enhancing relevance for complex queries.\n","authors":["Cherag Aroraa","Tracy Holloway King","Jayant Kumar","Yi Lu","Sanat Sharma","Arvind Srikantan","David Uvalle","Josep Valls-Vargas","Harsha Vardhan"],"pdf_url":"https://arxiv.org/pdf/2408.14698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14689v1","updated":"2024-08-26T23:29:03Z","published":"2024-08-26T23:29:03Z","title":"Federated User Preference Modeling for Privacy-Preserving Cross-Domain\n Recommendation","summary":" Cross-domain recommendation (CDR) aims to address the data-sparsity problem\nby transferring knowledge across domains. Existing CDR methods generally assume\nthat the user-item interaction data is shareable between domains, which leads\nto privacy leakage. Recently, some privacy-preserving CDR (PPCDR) models have\nbeen proposed to solve this problem. 
However, they primarily transfer simple\nrepresentations learned only from user-item interaction histories, overlooking\nother useful side information, leading to inaccurate user preferences.\nAdditionally, they transfer differentially private user-item interaction\nmatrices or embeddings across domains to protect privacy. However, these\nmethods offer limited privacy protection, as attackers may exploit external\ninformation to infer the original data. To address these challenges, we propose\na novel Federated User Preference Modeling (FUPM) framework. In FUPM, first, a\nnovel comprehensive preference exploration module is proposed to learn users'\ncomprehensive preferences from both interaction data and additional data\nincluding review texts and potentially positive items. Next, a private\npreference transfer module is designed to first learn differentially private\nlocal and global prototypes, and then privately transfer the global prototypes\nusing a federated learning strategy. These prototypes are generalized\nrepresentations of user groups, making it difficult for attackers to infer\nindividual information. Extensive experiments on four CDR tasks conducted on\nthe Amazon and Douban datasets validate the superiority of FUPM over SOTA\nbaselines. Code is available at https://github.com/Lili1013/FUPM.\n","authors":["Li Wang","Shoujin Wang","Quangui Zhang","Qiang Wu","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.14689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12276v2","updated":"2024-08-26T23:05:42Z","published":"2024-02-19T16:40:38Z","title":"Explain then Rank: Scale Calibration of Neural Rankers Using Natural\n Language Explanations from LLMs","summary":" In search settings, calibrating the scores during the ranking process to\nquantities such as click-through rates or relevance levels enhances a system's\nusefulness and trustworthiness for downstream users. While previous research\nhas improved this notion of calibration for low complexity learning-to-rank\nmodels, the larger data demands and parameter count specific to modern neural\ntext rankers produce unique obstacles that hamper the efficacy of methods\nintended for the learning-to-rank setting.\n This paper proposes exploiting large language models (LLMs) to provide\nrelevance and uncertainty signals for these neural text rankers to produce\nscale-calibrated scores through Monte Carlo sampling of natural language\nexplanations (NLEs). Our approach transforms the neural ranking task from\nranking textual query-document pairs to ranking corresponding synthesized NLEs.\nComprehensive experiments on two popular document ranking datasets show that\nthe NLE-based calibration approach consistently outperforms past calibration\nmethods and LLM-based methods for ranking, calibration, and query performance\nprediction tasks.\n","authors":["Puxuan Yu","Daniel Cohen","Hemank Lamba","Joel Tetreault","Alex Jaimes"],"pdf_url":"https://arxiv.org/pdf/2402.12276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14678v1","updated":"2024-08-26T23:01:48Z","published":"2024-08-26T23:01:48Z","title":"Bridging the Gap: Unpacking the Hidden Challenges in Knowledge\n Distillation for Online Ranking Systems","summary":" Knowledge Distillation (KD) is a powerful approach for compressing a large\nmodel into a smaller, more efficient model, particularly beneficial for\nlatency-sensitive applications like recommender systems. 
However, current KD\nresearch predominantly focuses on Computer Vision (CV) and NLP tasks,\noverlooking unique data characteristics and challenges inherent to recommender\nsystems. This paper addresses these overlooked challenges, specifically: (1)\nmitigating data distribution shifts between teacher and student models, (2)\nefficiently identifying optimal teacher configurations within time and\nbudgetary constraints, and (3) enabling computationally efficient and rapid\nsharing of teacher labels to support multiple students. We present a robust KD\nsystem developed and rigorously evaluated on multiple large-scale personalized\nvideo recommendation systems within Google. Our live experiment results\ndemonstrate significant improvements in student model performance while\nensuring consistent and reliable generation of high quality teacher labels from\na continuous data stream of data.\n","authors":["Nikhil Khani","Shuo Yang","Aniruddh Nath","Yang Liu","Pendo Abbo","Li Wei","Shawn Andrews","Maciej Kula","Jarrod Kahn","Zhe Zhao","Lichan Hong","Ed Chi"],"pdf_url":"https://arxiv.org/pdf/2408.14678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14658v1","updated":"2024-08-26T21:47:49Z","published":"2024-08-26T21:47:49Z","title":"KGPrune: a Web Application to Extract Subgraphs of Interest from\n Wikidata with Analogical Pruning","summary":" Knowledge graphs (KGs) have become ubiquitous publicly available knowledge\nsources, and are nowadays covering an ever increasing array of domains.\nHowever, not all knowledge represented is useful or pertaining when considering\na new application or specific task. Also, due to their increasing size,\nhandling large KGs in their entirety entails scalability issues. These two\naspects asks for efficient methods to extract subgraphs of interest from\nexisting KGs. To this aim, we introduce KGPrune, a Web Application that, given\nseed entities of interest and properties to traverse, extracts their\nneighboring subgraphs from Wikidata. To avoid topical drift, KGPrune relies on\na frugal pruning algorithm based on analogical reasoning to only keep relevant\nneighbors while pruning irrelevant ones. The interest of KGPrune is illustrated\nby two concrete applications, namely, bootstrapping an enterprise KG and\nextracting knowledge related to looted artworks.\n","authors":["Pierre Monnin","Cherif-Hassan Nousradine","Lucas Jarnac","Laurel Zuckerman","Miguel Couceiro"],"pdf_url":"https://arxiv.org/pdf/2408.14658v1.pdf","comment":"Accepted as a demo paper at ECAI 2024"},{"id":"http://arxiv.org/abs/2408.14636v1","updated":"2024-08-26T21:00:25Z","published":"2024-08-26T21:00:25Z","title":"Relationships are Complicated! An Analysis of Relationships Between\n Datasets on the Web","summary":" The Web today has millions of datasets, and the number of datasets continues\nto grow at a rapid pace. These datasets are not standalone entities; rather,\nthey are intricately connected through complex relationships. Semantic\nrelationships between datasets provide critical insights for research and\ndecision-making processes. In this paper, we study dataset relationships from\nthe perspective of users who discover, use, and share datasets on the Web: what\nrelationships are important for different tasks? What contextual information\nmight users want to know? We first present a comprehensive taxonomy of\nrelationships between datasets on the Web and map these relationships to user\ntasks performed during dataset discovery. 
We develop a series of methods to\nidentify these relationships and compare their performance on a large corpus of\ndatasets generated from Web pages with schema.org markup. We demonstrate that\nmachine-learning based methods that use dataset metadata achieve multi-class\nclassification accuracy of 90%. Finally, we highlight gaps in available\nsemantic markup for datasets and discuss how incorporating comprehensive\nsemantics can facilitate the identification of dataset relationships. By\nproviding a comprehensive overview of dataset relationships at scale, this\npaper sets a benchmark for future research.\n","authors":["Kate Lin","Tarfah Alrashed","Natasha Noy"],"pdf_url":"https://arxiv.org/pdf/2408.14636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14623v1","updated":"2024-08-26T20:36:52Z","published":"2024-08-26T20:36:52Z","title":"MODOC: A Modular Interface for Flexible Interlinking of Text Retrieval\n and Text Generation Functions","summary":" Large Language Models (LLMs) produce eloquent texts but often the content\nthey generate needs to be verified. Traditional information retrieval systems\ncan assist with this task, but most systems have not been designed with\nLLM-generated queries in mind. As such, there is a compelling need for\nintegrated systems that provide both retrieval and generation functionality\nwithin a single user interface.\n We present MODOC, a modular user interface that leverages the capabilities of\nLLMs and provides assistance with detecting their confabulations, promoting\nintegrity in scientific writing. MODOC represents a significant step forward in\nscientific writing assistance. Its modular architecture supports flexible\nfunctions for retrieving information and for writing and generating text in a\nsingle, user-friendly interface.\n","authors":["Yingqiang Gao","Jhony Prada","Nianlong Gu","Jessica Lam","Richard H. R. Hahnloser"],"pdf_url":"https://arxiv.org/pdf/2408.14623v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.14471v1","updated":"2024-08-26T17:59:01Z","published":"2024-08-26T17:59:01Z","title":"A Practitioner's Guide to Continual Multimodal Pretraining","summary":" Multimodal foundation models serve numerous applications at the intersection\nof vision and language. Still, despite being pretrained on extensive data, they\nbecome outdated over time. To keep models updated, research into continual\npretraining mainly explores scenarios with either (1) infrequent,\nindiscriminate updates on large-scale new data, or (2) frequent, sample-level\nupdates. However, practical model deployment often operates in the gap between\nthese two limit cases, as real-world applications often demand adaptation to\nspecific subdomains, tasks or concepts -- spread over the entire, varying life\ncycle of a model. In this work, we complement current perspectives on continual\npretraining through a research test bed as well as provide comprehensive\nguidance for effective continual model updates in such scenarios. We first\nintroduce FoMo-in-Flux, a continual multimodal pretraining benchmark with\nrealistic compute constraints and practical deployment requirements,\nconstructed over 63 datasets with diverse visual and semantic coverage. 
Using\nFoMo-in-Flux, we explore the complex landscape of practical continual\npretraining through multiple perspectives: (1) A data-centric investigation of\ndata mixtures and stream orderings that emulate real-world deployment\nsituations, (2) a method-centric investigation ranging from simple fine-tuning\nand traditional continual learning strategies to parameter-efficient updates\nand model merging, (3) meta learning rate schedules and mechanistic design\nchoices, and (4) the influence of model and compute scaling. Together, our\ninsights provide a practitioner's guide to continual multimodal pretraining for\nreal-world deployment. Our benchmark and code is here:\nhttps://github.com/ExplainableML/fomo_in_flux.\n","authors":["Karsten Roth","Vishaal Udandarao","Sebastian Dziadzio","Ameya Prabhu","Mehdi Cherti","Oriol Vinyals","Olivier Hénaff","Samuel Albanie","Matthias Bethge","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2408.14471v1.pdf","comment":"Technical Report. 52 pages"},{"id":"http://arxiv.org/abs/2408.14461v1","updated":"2024-08-26T17:50:47Z","published":"2024-08-26T17:50:47Z","title":"A domain decomposition-based autoregressive deep learning model for\n unsteady and nonlinear partial differential equations","summary":" In this paper, we propose a domain-decomposition-based deep learning (DL)\nframework, named transient-CoMLSim, for accurately modeling unsteady and\nnonlinear partial differential equations (PDEs). The framework consists of two\nkey components: (a) a convolutional neural network (CNN)-based autoencoder\narchitecture and (b) an autoregressive model composed of fully connected\nlayers. Unlike existing state-of-the-art methods that operate on the entire\ncomputational domain, our CNN-based autoencoder computes a lower-dimensional\nbasis for solution and condition fields represented on subdomains. Timestepping\nis performed entirely in the latent space, generating embeddings of the\nsolution variables from the time history of embeddings of solution and\ncondition variables. This approach not only reduces computational complexity\nbut also enhances scalability, making it well-suited for large-scale\nsimulations. Furthermore, to improve the stability of our rollouts, we employ a\ncurriculum learning (CL) approach during the training of the autoregressive\nmodel. The domain-decomposition strategy enables scaling to out-of-distribution\ndomain sizes while maintaining the accuracy of predictions -- a feature not\neasily integrated into popular DL-based approaches for physics simulations. We\nbenchmark our model against two widely-used DL architectures, Fourier Neural\nOperator (FNO) and U-Net, and demonstrate that our framework outperforms them\nin terms of accuracy, extrapolation to unseen timesteps, and stability for a\nwide range of use cases.\n","authors":["Sheel Nidhan","Haoliang Jiang","Lalit Ghule","Clancy Umphrey","Rishikesh Ranade","Jay Pathak"],"pdf_url":"https://arxiv.org/pdf/2408.14461v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2408.11796v2","updated":"2024-08-26T17:50:46Z","published":"2024-08-21T17:38:48Z","title":"LLM Pruning and Distillation in Practice: The Minitron Approach","summary":" We present a comprehensive report on compressing the Llama 3.1 8B and Mistral\nNeMo 12B models to 4B and 8B parameters, respectively, using pruning and\ndistillation. 
We explore two distinct pruning strategies: (1) depth pruning and\n(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on\ncommon benchmarks from the LM Evaluation Harness. The models are then aligned\nwith NeMo Aligner and tested in instruct-tuned versions. This approach produces\na compelling 4B model from Llama 3.1 8B and a state-of-the-art\nMistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo\n12B. We found that with no access to the original data, it is beneficial to\nslightly fine-tune teacher models on the distillation dataset. We open-source\nour base model weights on Hugging Face with a permissive license.\n","authors":["Sharath Turuvekere Sreenivas","Saurav Muralidharan","Raviraj Joshi","Marcin Chochowski","Mostofa Patwary","Mohammad Shoeybi","Bryan Catanzaro","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2408.11796v2.pdf","comment":"v2: Added missing references. Cleaned up runtime performance section"},{"id":"http://arxiv.org/abs/2408.14453v1","updated":"2024-08-26T17:48:42Z","published":"2024-08-26T17:48:42Z","title":"Reconstructing physiological signals from fMRI across the adult lifespan","summary":" Interactions between the brain and body are of fundamental importance for\nhuman behavior and health. Functional magnetic resonance imaging (fMRI)\ncaptures whole-brain activity noninvasively, and modeling how fMRI signals\ninteract with physiological dynamics of the body can provide new insight into\nbrain function and offer potential biomarkers of disease. However,\nphysiological recordings are not always possible to acquire since they require\nextra equipment and setup, and even when they are, the recorded physiological\nsignals may contain substantial artifacts. To overcome this limitation, machine\nlearning models have been proposed to directly extract features of respiratory\nand cardiac activity from resting-state fMRI signals. To date, such work has\nbeen carried out only in healthy young adults and in a pediatric population,\nleaving open questions about the efficacy of these approaches on older adults.\nHere, we propose a novel framework that leverages Transformer-based\narchitectures for reconstructing two key physiological signals - low-frequency\nrespiratory volume (RV) and heart rate (HR) fluctuations - from fMRI data, and\ntest these models on a dataset of individuals aged 36-89 years old. Our\nframework outperforms previously proposed approaches (attaining median\ncorrelations between predicted and measured signals of r ~ .698 for RV and r ~\n.618 for HR), indicating the potential of leveraging attention mechanisms to\nmodel fMRI-physiological signal relationships. We also evaluate several model\ntraining and fine-tuning strategies, and find that incorporating young-adult\ndata during training improves the performance when predicting physiological\nsignals in the aging cohort. Overall, our approach successfully infers key\nphysiological variables directly from fMRI data from individuals across a wide\nrange of the adult lifespan.\n","authors":["Shiyu Wang","Ziyuan Xu","Yamin Li","Mara Mather","Roza G. Bayrak","Catie Chang"],"pdf_url":"https://arxiv.org/pdf/2408.14453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14445v1","updated":"2024-08-26T17:36:51Z","published":"2024-08-26T17:36:51Z","title":"Symmetry & Critical Points","summary":" Critical points of an invariant function may or may not be symmetric. 
We\nprove, however, that if a symmetric critical point exists, those adjacent to it\nare generically symmetry breaking. This mathematical mechanism is shown to\ncarry important implications for our ability to efficiently minimize invariant\nnonconvex functions, in particular those associated with neural networks.\n","authors":["Yossi Arjevani"],"pdf_url":"https://arxiv.org/pdf/2408.14445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14442v1","updated":"2024-08-26T17:35:01Z","published":"2024-08-26T17:35:01Z","title":"Model Parallel Training and Transfer Learning for Convolutional Neural\n Networks by Domain Decomposition","summary":" Deep convolutional neural networks (CNNs) have been shown to be very\nsuccessful in a wide range of image processing applications. However, due to\ntheir increasing number of model parameters and an increasing availability of\nlarge amounts of training data, parallelization strategies to efficiently train\ncomplex CNNs are necessary. In previous work by the authors, a novel model\nparallel CNN architecture was proposed which is loosely inspired by domain\ndecomposition. In particular, the novel network architecture is based on a\ndecomposition of the input data into smaller subimages. For each of these\nsubimages, local CNNs with a proportionally smaller number of parameters are\ntrained in parallel and the resulting local classifications are then aggregated\nin a second step by a dense feedforward neural network (DNN). In the present\nwork, we compare the resulting CNN-DNN architecture to less costly alternatives\nto combine the local classifications into a final, global decision.\nAdditionally, we investigate the performance of the CNN-DNN trained as one\ncoherent model as well as using a transfer learning strategy, where the\nparameters of the pre-trained local CNNs are used as initial values for a\nsubsequently trained global coherent CNN-DNN model.\n","authors":["Axel Klawonn","Martin Lanser","Janine Weber"],"pdf_url":"https://arxiv.org/pdf/2408.14442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13840v3","updated":"2024-08-26T17:34:44Z","published":"2023-06-24T02:25:56Z","title":"Beyond Scale: The Diversity Coefficient as a Data Quality Metric for\n Variability in Natural Language Data","summary":" Current trends in pre-training Large Language Models (LLMs) primarily focus\non the scaling of model and dataset size. While the quality of pre-training\ndata is considered an important factor for training powerful LLMs, it remains a\nnebulous concept that has not been rigorously characterized. To this end, we\npropose a formalization of one key aspect of data quality -- measuring the\nvariability of natural language data -- specifically via a measure we call the\ndiversity coefficient. Our empirical analysis shows that the proposed diversity\ncoefficient aligns with the intuitive properties of diversity and variability,\ne.g., it increases as the number of latent concepts increases. Then, we measure\nthe diversity coefficient of publicly available pre-training datasets and\ndemonstrate that their formal diversity is high compared to theoretical lower\nand upper bounds. Finally, we conduct a comprehensive set of controlled\ninterventional experiments with GPT-2 and LLaMAv2 that demonstrate the\ndiversity coefficient of pre-training data characterizes useful aspects of\ndownstream model evaluation performance -- totaling 44 models of various sizes\n(51M to 7B parameters). 
We conclude that our formal notion of diversity is an\nimportant aspect of data quality that captures variability and causally leads\nto improved evaluation performance.\n","authors":["Brando Miranda","Alycia Lee","Sudharsan Sundar","Allison Casasola","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2306.13840v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10844v2","updated":"2024-08-26T17:31:16Z","published":"2024-07-15T15:59:39Z","title":"Improved Uncertainty Estimation of Graph Neural Network Potentials Using\n Engineered Latent Space Distances","summary":" Graph neural networks (GNNs) have been shown to be astonishingly capable\nmodels for molecular property prediction, particularly as surrogates for\nexpensive density functional theory calculations of relaxed energy for novel\nmaterial discovery. However, one limitation of GNNs in this context is the lack\nof useful uncertainty prediction methods, as this is critical to the material\ndiscovery pipeline. In this work, we show that uncertainty quantification for\nrelaxed energy calculations is more complex than uncertainty quantification for\nother kinds of molecular property prediction, due to the effect that structure\noptimizations have on the error distribution. We propose that distribution-free\ntechniques are more useful tools for assessing calibration, recalibrating, and\ndeveloping uncertainty prediction methods for GNNs performing relaxed energy\ncalculations. We also develop a relaxed energy task for evaluating uncertainty\nmethods for equivariant GNNs, based on distribution-free recalibration and\nusing the Open Catalyst Project dataset. We benchmark a set of popular\nuncertainty prediction methods on this task, and show that latent distance\nmethods, with our novel improvements, are the most well-calibrated and\neconomical approach for relaxed energy calculations. Finally, we demonstrate\nthat our latent space distance method produces results which align with our\nexpectations on a clustering example, and on specific equation of state and\nadsorbate coverage examples from outside the training dataset.\n","authors":["Joseph Musielewicz","Janice Lan","Matt Uyttendaele","John R. Kitchin"],"pdf_url":"https://arxiv.org/pdf/2407.10844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10468v3","updated":"2024-08-26T17:28:23Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. 
To establish easily obtained groundtruth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E\ndataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14435v1","updated":"2024-08-26T17:21:54Z","published":"2024-08-26T17:21:54Z","title":"Social perception of faces in a vision-language model","summary":" We explore social perception of human faces in CLIP, a widely used\nopen-source vision-language model. To this end, we compare the similarity in\nCLIP embeddings between different textual prompts and a set of face images. Our\ntextual prompts are constructed from well-validated social psychology terms\ndenoting social perception. The face images are synthetic and are\nsystematically and independently varied along six dimensions: the legally\nprotected attributes of age, gender, and race, as well as facial expression,\nlighting, and pose. Independently and systematically manipulating face\nattributes allows us to study the effect of each on social perception and\navoids confounds that can occur in wild-collected data due to uncontrolled\nsystematic correlations between attributes. Thus, our findings are experimental\nrather than observational. Our main findings are three. First, while CLIP is\ntrained on the widest variety of images and texts, it is able to make\nfine-grained human-like social judgments on face images. Second, age, gender,\nand race do systematically impact CLIP's social perception of faces, suggesting\nan undesirable bias in CLIP vis-a-vis legally protected attributes. Most\nstrikingly, we find a strong pattern of bias concerning the faces of Black\nwomen, where CLIP produces extreme values of social perception across different\nages and facial expressions. Third, facial expression impacts social perception\nmore than age and lighting as much as age. The last finding predicts that\nstudies that do not control for unprotected visual attributes may reach the\nwrong conclusions on bias. Our novel method of investigation, which is founded\non the social psychology literature and on the experiments involving the\nmanipulation of individual attributes, yields sharper and more reliable\nobservations than previous observational methods and may be applied to study\nbiases in any vision-language model.\n","authors":["Carina I. Hausladen","Manuel Knott","Colin F. Camerer","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2408.14435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14434v1","updated":"2024-08-26T17:21:19Z","published":"2024-08-26T17:21:19Z","title":"Employing Artificial Intelligence to Steer Exascale Workflows with\n Colmena","summary":" Computational workflows are a common class of application on supercomputers,\nyet the loosely coupled and heterogeneous nature of workflows often fails to\ntake full advantage of their capabilities. 
We created Colmena to leverage the\nmassive parallelism of a supercomputer by using Artificial Intelligence (AI) to\nlearn from and adapt a workflow as it executes. Colmena allows scientists to\ndefine how their application should respond to events (e.g., task completion)\nas a series of cooperative agents. In this paper, we describe the design of\nColmena, the challenges we overcame while deploying applications on exascale\nsystems, and the science workflows we have enhanced through interweaving AI.\nThe scaling challenges we discuss include developing steering strategies that\nmaximize node utilization, introducing data fabrics that reduce communication\noverhead of data-intensive tasks, and implementing workflow tasks that cache\ncostly operations between invocations. These innovations coupled with a variety\nof application patterns accessible through our agent-based steering model have\nenabled science advances in chemistry, biophysics, and materials science using\ndifferent types of AI. Our vision is that Colmena will spur creative solutions\nthat harness AI across many domains of scientific computing.\n","authors":["Logan Ward","J. Gregory Pauloski","Valerie Hayot-Sasson","Yadu Babuji","Alexander Brace","Ryan Chard","Kyle Chard","Rajeev Thakur","Ian Foster"],"pdf_url":"https://arxiv.org/pdf/2408.14434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14432v1","updated":"2024-08-26T17:20:34Z","published":"2024-08-26T17:20:34Z","title":"Contextual Bandit with Herding Effects: Algorithms and Recommendation\n Applications","summary":" Contextual bandits serve as a fundamental algorithmic framework for\noptimizing recommendation decisions online. Though extensive attention has been\npaid to tailoring contextual bandits for recommendation applications, the\n\"herding effects\" in user feedback have been ignored. These herding effects\nbias user feedback toward historical ratings, breaking down the assumption of\nunbiased feedback inherent in contextual bandits. This paper develops a novel\nvariant of the contextual bandit that is tailored to address the feedback bias\ncaused by the herding effects. A user feedback model is formulated to capture\nthis feedback bias. We design the TS-Conf (Thompson Sampling under Conformity)\nalgorithm, which employs posterior sampling to balance the exploration and\nexploitation tradeoff. We prove an upper bound for the regret of the algorithm,\nrevealing the impact of herding effects on learning speed. Extensive\nexperiments on datasets demonstrate that TS-Conf outperforms four benchmark\nalgorithms. Analysis reveals that TS-Conf effectively mitigates the negative\nimpact of herding effects, resulting in faster learning and improved\nrecommendation accuracy.\n","authors":["Luyue Xu","Liming Wang","Hong Xie","Mingqiang Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.14432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03341v4","updated":"2024-08-26T17:12:07Z","published":"2024-06-05T14:58:32Z","title":"Tackling GenAI Copyright Issues: Originality Estimation and\n Genericization","summary":" The rapid progress of generative AI technology has sparked significant\ncopyright concerns, leading to numerous lawsuits filed against AI developers.\nWhile various techniques for mitigating copyright issues have been studied,\nsignificant risks remain. Here, we propose a genericization method that\nmodifies the outputs of a generative model to make them more generic and less\nlikely to infringe copyright. 
To achieve this, we introduce a metric for\nquantifying the level of originality of data in a manner that is consistent\nwith the legal framework. This metric can be practically estimated by drawing\nsamples from a generative model, which is then used for the genericization\nprocess. As a practical implementation, we introduce PREGen, which combines our\ngenericization method with an existing mitigation technique. Experiments\ndemonstrate that our genericization method successfully modifies the output of\na text-to-image generative model so that it produces more generic,\ncopyright-compliant images. Compared to the existing method, PREGen reduces the\nlikelihood of generating copyrighted characters by more than half when the\nnames of copyrighted characters are used as the prompt, dramatically improving\nthe performance. Additionally, while generative models can produce copyrighted\ncharacters even when their names are not directly mentioned in the prompt,\nPREGen almost entirely prevents the generation of such characters in these\ncases.\n","authors":["Hiroaki Chiba-Okabe","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2406.03341v4.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.12295v3","updated":"2024-08-26T17:10:41Z","published":"2024-05-20T18:01:15Z","title":"Efficient Model-Stealing Attacks Against Inductive Graph Neural Networks","summary":" Graph Neural Networks (GNNs) are recognized as potent tools for processing\nreal-world data organized in graph structures. Especially inductive GNNs, which\nallow for the processing of graph-structured data without relying on predefined\ngraph structures, are becoming increasingly important in a wide range of\napplications. As such these networks become attractive targets for\nmodel-stealing attacks where an adversary seeks to replicate the functionality\nof the targeted network. Significant efforts have been devoted to developing\nmodel-stealing attacks that extract models trained on images and texts.\nHowever, little attention has been given to stealing GNNs trained on graph\ndata. This paper identifies a new method of performing unsupervised\nmodel-stealing attacks against inductive GNNs, utilizing graph contrastive\nlearning and spectral graph augmentations to efficiently extract information\nfrom the targeted model. The new type of attack is thoroughly evaluated on six\ndatasets and the results show that our approach outperforms the current\nstate-of-the-art by Shen et al. (2021). In particular, our attack surpasses the\nbaseline across all benchmarks, attaining superior fidelity and downstream\naccuracy of the stolen model while necessitating fewer queries directed toward\nthe target model.\n","authors":["Marcin Podhajski","Jan Dubiński","Franziska Boenisch","Adam Dziedzic","Agnieszka Pregowska And Tomasz Michalak"],"pdf_url":"https://arxiv.org/pdf/2405.12295v3.pdf","comment":"Accepted at ECAI - 27TH EUROPEAN CONFERENCE ON ARTIFICIAL\n INTELLIGENCE"},{"id":"http://arxiv.org/abs/2408.14421v1","updated":"2024-08-26T17:04:52Z","published":"2024-08-26T17:04:52Z","title":"Evaluating saliency scores in point clouds of natural environments by\n learning surface anomalies","summary":" In recent years, three-dimensional point clouds are used increasingly to\ndocument natural environments. Each dataset contains a diverse set of objects,\nat varying shapes and sizes, distributed throughout the data and intricately\nintertwined with the topography. 
Therefore, regions of interest are difficult\nto find and consequent analyses become a challenge. Inspired from visual\nperception principles, we propose to differentiate objects of interest from the\ncluttered environment by evaluating how much they stand out from their\nsurroundings, i.e., their geometric salience. Previous saliency detection\napproaches suggested mostly handcrafted attributes for the task. However, such\nmethods fail when the data are too noisy or have high levels of texture. Here\nwe propose a learning-based mechanism that accommodates noise and textured\nsurfaces. We assume that within the natural environment any change from the\nprevalent surface would suggest a salient object. Thus, we first learn the\nunderlying surface and then search for anomalies within it. Initially, a deep\nneural network is trained to reconstruct the surface. Regions where the\nreconstructed part deviates significantly from the original point cloud yield a\nsubstantial reconstruction error, signifying an anomaly, i.e., saliency. We\ndemonstrate the effectiveness of the proposed approach by searching for salient\nfeatures in various natural scenarios, which were acquired by different\nacquisition platforms. We show the strong correlation between the\nreconstruction error and salient objects.\n","authors":["Reuma Arav","Dennis Wittich","Franz Rottensteiner"],"pdf_url":"https://arxiv.org/pdf/2408.14421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14416v1","updated":"2024-08-26T17:03:14Z","published":"2024-08-26T17:03:14Z","title":"Hyperdimensional Computing Empowered Federated Foundation Model over\n Wireless Networks for Metaverse","summary":" The Metaverse, a burgeoning collective virtual space merging augmented\nreality and persistent virtual worlds, necessitates advanced artificial\nintelligence (AI) and communication technologies to support immersive and\ninteractive experiences. Federated learning (FL) has emerged as a promising\ntechnique for collaboratively training AI models while preserving data privacy.\nHowever, FL faces challenges such as high communication overhead and\nsubstantial computational demands, particularly for neural network (NN) models.\nTo address these issues, we propose an integrated federated split learning and\nhyperdimensional computing (FSL-HDC) framework for emerging foundation models.\nThis novel approach reduces communication costs, computation load, and privacy\nrisks, making it particularly suitable for resource-constrained edge devices in\nthe Metaverse, ensuring real-time responsive interactions. Additionally, we\nintroduce an optimization algorithm that concurrently optimizes transmission\npower and bandwidth to minimize the maximum transmission time among all users\nto the server. The simulation results based on the MNIST dataset indicate that\nFSL-HDC achieves an accuracy rate of approximately 87.5%, which is slightly\nlower than that of FL-HDC. However, FSL-HDC exhibits a significantly faster\nconvergence speed, approximately 3.733x that of FSL-NN, and demonstrates\nrobustness to non-IID data distributions. 
Moreover, our proposed optimization\nalgorithm can reduce the maximum transmission time by up to 64% compared with\nthe baseline.\n","authors":["Yahao Ding","Wen Shang","Minrui Xu","Zhaohui Yang","Ye Hu","Dusit Niyato","Mohammad Shikh-Bahaei"],"pdf_url":"https://arxiv.org/pdf/2408.14416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14415v1","updated":"2024-08-26T17:02:25Z","published":"2024-08-26T17:02:25Z","title":"LoG-VMamba: Local-Global Vision Mamba for Medical Image Segmentation","summary":" Mamba, a State Space Model (SSM), has recently shown competitive performance\nto Convolutional Neural Networks (CNNs) and Transformers in Natural Language\nProcessing and general sequence modeling. Various attempts have been made to\nadapt Mamba to Computer Vision tasks, including medical image segmentation\n(MIS). Vision Mamba (VM)-based networks are particularly attractive due to\ntheir ability to achieve global receptive fields, similar to Vision\nTransformers, while also maintaining linear complexity in the number of tokens.\nHowever, the existing VM models still struggle to maintain both spatially local\nand global dependencies of tokens in high dimensional arrays due to their\nsequential nature. Employing multiple and/or complicated scanning strategies is\ncomputationally costly, which hinders applications of SSMs to high-dimensional\n2D and 3D images that are common in MIS problems. In this work, we propose\nLocal-Global Vision Mamba, LoG-VMamba, that explicitly enforces spatially\nadjacent tokens to remain nearby on the channel axis, and retains the global\ncontext in a compressed form. Our method allows the SSMs to access the local\nand global contexts even before reaching the last token while requiring only a\nsimple scanning strategy. Our segmentation models are computationally efficient\nand substantially outperform both CNN and Transformers-based baselines on a\ndiverse set of 2D and 3D MIS tasks. The implementation of LoG-VMamba is\navailable at \\url{https://github.com/Oulu-IMEDS/LoG-VMamba}.\n","authors":["Trung Dinh Quoc Dang","Huy Hoang Nguyen","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2408.14415v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2407.03194v5","updated":"2024-08-26T16:57:34Z","published":"2024-07-03T15:26:02Z","title":"Prediction Instability in Machine Learning Ensembles","summary":" In machine learning ensembles predictions from multiple models are\naggregated. Despite widespread use and strong performance of ensembles in\napplied problems little is known about the mathematical properties of\naggregating models and associated consequences for safe, explainable use of\nsuch models. In this paper we prove a theorem that shows that any ensemble will\nexhibit at least one of the following forms of prediction instability. It will\neither ignore agreement among all underlying models, change its mind when none\nof the underlying models have done so, or be manipulable through inclusion or\nexclusion of options it would never actually predict. As a consequence,\nensemble aggregation procedures will always need to balance the benefits of\ninformation use against the risk of these prediction instabilities. This\nanalysis also sheds light on what specific forms of prediction instability to\nexpect from particular ensemble algorithms; for example popular tree ensembles\nlike random forest, or xgboost will violate basic, intuitive fairness\nproperties. 
Finally, we show that this can be ameliorated by using consistent\nmodels in asymptotic conditions.\n","authors":["Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2407.03194v5.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.14407v1","updated":"2024-08-26T16:49:42Z","published":"2024-08-26T16:49:42Z","title":"Spectrally Informed Learning of Fluid Flows","summary":" Accurate and efficient fluid flow models are essential for applications\nrelating to many physical phenomena including geophysical, aerodynamic, and\nbiological systems. While these flows may exhibit rich and multiscale dynamics,\nin many cases underlying low-rank structures exist which describe the bulk of\nthe motion. These structures tend to be spatially large and temporally slow,\nand may contain most of the energy in a given flow. The extraction and\nparsimonious representation of these low-rank dynamics from high-dimensional\ndata is a key challenge. Inspired by the success of physics-informed machine\nlearning methods, we propose a spectrally-informed approach to extract low-rank\nmodels of fluid flows by leveraging known spectral properties in the learning\nprocess. We incorporate this knowledge by imposing regularizations on the\nlearned dynamics, which bias the training process towards learning\nlow-frequency structures with corresponding higher power. We demonstrate the\neffectiveness of this method to improve prediction and produce learned models\nwhich better match the underlying spectral properties of prototypical fluid\nflows.\n","authors":["Benjamin D. Shaffer","Jeremy R. Vorenberg","M. Ani Hsieh"],"pdf_url":"https://arxiv.org/pdf/2408.14407v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.05720v2","updated":"2024-08-26T16:48:08Z","published":"2024-03-08T23:17:55Z","title":"A Dataset and Benchmark for Hospital Course Summarization with Adapted\n Large Language Models","summary":" Brief hospital course (BHC) summaries are clinical documents that summarize a\npatient's hospital stay. While large language models (LLMs) depict remarkable\ncapabilities in automating real-world tasks, their capabilities for healthcare\napplications such as synthesizing BHCs from clinical notes have not been shown.\nWe introduce a novel pre-processed dataset, the MIMIC-IV-BHC, encapsulating\nclinical note and brief hospital course (BHC) pairs to adapt LLMs for BHC\nsynthesis. Furthermore, we introduce a benchmark of the summarization\nperformance of two general-purpose LLMs and three healthcare-adapted LLMs.\n Using clinical notes as input, we apply prompting-based (using in-context\nlearning) and fine-tuning-based adaptation strategies to three open-source LLMs\n(Clinical-T5-Large, Llama2-13B, FLAN-UL2) and two proprietary LLMs (GPT-3.5,\nGPT-4). We evaluate these LLMs across multiple context-length inputs using\nnatural language similarity metrics. We further conduct a clinical study with\nfive clinicians, comparing clinician-written and LLM-generated BHCs across 30\nsamples, focusing on their potential to enhance clinical decision-making\nthrough improved summary quality. We observe that the Llama2-13B fine-tuned LLM\noutperforms other domain-adapted models given quantitative evaluation metrics\nof BLEU and BERT-Score. 
GPT-4 with in-context learning shows more robustness to\nincreasing context lengths of clinical note inputs than fine-tuned Llama2-13B.\nDespite comparable quantitative metrics, the reader study depicts a significant\npreference for summaries generated by GPT-4 with in-context learning compared\nto both Llama2-13B fine-tuned summaries and the original summaries,\nhighlighting the need for qualitative clinical evaluation.\n","authors":["Asad Aali","Dave Van Veen","Yamin Ishraq Arefeen","Jason Hom","Christian Bluethgen","Eduardo Pontes Reis","Sergios Gatidis","Namuun Clifford","Joseph Daws","Arash S. Tehrani","Jangwon Kim","Akshay S. Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2403.05720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14404v1","updated":"2024-08-26T16:47:20Z","published":"2024-08-26T16:47:20Z","title":"Application of Neural Ordinary Differential Equations for ITER Burning\n Plasma Dynamics","summary":" The dynamics of burning plasmas in tokamaks are crucial for advancing\ncontrolled thermonuclear fusion. This study introduces the NeuralPlasmaODE, a\nmulti-region multi-timescale transport model to simulate the complex energy\ntransfer processes in ITER deuterium-tritium (D-T) plasmas. Our model captures\nthe interactions between energetic alpha particles, electrons, and ions, which\nare vital for understanding phenomena such as thermal runaway instability. We\nemploy neural ordinary differential equations (Neural ODEs) for the numerical\nderivation of diffusivity parameters, enabling precise modeling of energy\ninteractions between different plasma regions. By leveraging transfer learning,\nwe utilize model parameters derived from DIII-D experimental data, enhancing\nthe efficiency and accuracy of our simulations without training from scratch.\nApplying this model to ITER's inductive and non-inductive operational\nscenarios, our results demonstrate that radiation and transport processes\neffectively remove excess heat from the core plasma, preventing thermal runaway\ninstability. This study underscores the potential of machine learning in\nadvancing our understanding and control of burning plasma dynamics in fusion\nreactors.\n","authors":["Zefang Liu","Weston M. Stacey"],"pdf_url":"https://arxiv.org/pdf/2408.14404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14398v1","updated":"2024-08-26T16:29:13Z","published":"2024-08-26T16:29:13Z","title":"Language-specific Calibration for Pruning Multilingual Language Models","summary":" Recent advances in large language model (LLM) pruning have shown\nstate-of-the-art compression results in post-training and retraining-free\nsettings while maintaining high predictive performance. However, such research\nmainly considers calibrating pruning using English text, despite the\nmultilingual nature of modern LLMs and their frequent uses in non-English\nlanguages. In this paper, we set out to explore effective strategies for\ncalibrating the pruning of multilingual language models. We present the first\ncomprehensive empirical study, comparing different calibration languages for\npruning multilingual models across diverse tasks, models, and state-of-the-art\npruning techniques. Our results present practical suggestions, for example,\ncalibrating in the target language can efficiently yield lower perplexity, but\ndoes not necessarily benefit downstream tasks. 
Our further analysis experiments\nunveil that calibration in the target language mainly contributes to preserving\nlanguage-specific features related to fluency and coherence, but might not\ncontribute to capturing language-agnostic features such as language\nunderstanding and reasoning. Last, we provide practical recommendations for\nfuture practitioners.\n","authors":["Simon Kurz","Zhixue Zhao","Jian-Jia Chen","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2408.14398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14393v1","updated":"2024-08-26T16:21:50Z","published":"2024-08-26T16:21:50Z","title":"CURE4Rec: A Benchmark for Recommendation Unlearning with Deeper\n Influence","summary":" With increasing privacy concerns in artificial intelligence, regulations have\nmandated the right to be forgotten, granting individuals the right to withdraw\ntheir data from models. Machine unlearning has emerged as a potential solution\nto enable selective forgetting in models, particularly in recommender systems\nwhere historical data contains sensitive user information. Despite recent\nadvances in recommendation unlearning, evaluating unlearning methods\ncomprehensively remains challenging due to the absence of a unified evaluation\nframework and overlooked aspects of deeper influence, e.g., fairness. To\naddress these gaps, we propose CURE4Rec, the first comprehensive benchmark for\nrecommendation unlearning evaluation. CURE4Rec covers four aspects, i.e.,\nunlearning Completeness, recommendation Utility, unleaRning efficiency, and\nrecommendation fairnEss, under three data selection strategies, i.e., core\ndata, edge data, and random data. Specifically, we consider the deeper\ninfluence of unlearning on recommendation fairness and robustness towards data\nwith varying impact levels. We construct multiple datasets with CURE4Rec\nevaluation and conduct extensive experiments on existing recommendation\nunlearning methods. Our code is released at\nhttps://github.com/xiye7lai/CURE4Rec.\n","authors":["Chaochao Chen","Jiaming Zhang","Yizhao Zhang","Li Zhang","Lingjuan Lyu","Yuyuan Li","Biao Gong","Chenggang Yan"],"pdf_url":"https://arxiv.org/pdf/2408.14393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14387v1","updated":"2024-08-26T16:11:53Z","published":"2024-08-26T16:11:53Z","title":"Reprogramming Foundational Large Language Models(LLMs) for Enterprise\n Adoption for Spatio-Temporal Forecasting Applications: Unveiling a New Era in\n Copilot-Guided Cross-Modal Time Series Representation Learning","summary":" Spatio-temporal forecasting plays a crucial role in various sectors such as\ntransportation systems, logistics, and supply chain management. However,\nexisting methods are limited by their ability to handle large, complex\ndatasets. To overcome this limitation, we introduce a hybrid approach that\ncombines the strengths of open-source large and small-scale language models\n(LLMs and LMs) with traditional forecasting methods. We augment traditional\nmethods with dynamic prompting and a grouped-query, multi-head attention\nmechanism to more effectively capture both intra-series and inter-series\ndependencies in evolving nonlinear time series data. 
In addition, we facilitate\non-premises customization by fine-tuning smaller open-source LMs for time\nseries trend analysis utilizing descriptions generated by open-source large LMs\non consumer-grade hardware using Low-Rank Adaptation with Activation Memory\nReduction (LoRA-AMR) technique to reduce computational overhead and activation\nstorage memory demands while preserving inference latency. We combine language\nmodel processing for time series trend analysis with traditional time series\nrepresentation learning method for cross-modal integration, achieving robust\nand accurate forecasts. The framework effectiveness is demonstrated through\nextensive experiments on various real-world datasets, outperforming existing\nmethods by significant margins in terms of forecast accuracy.\n","authors":["Sakhinana Sagar Srinivas","Chidaksh Ravuru","Geethan Sannidhi","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.14387v1.pdf","comment":"Paper published at the Deployable AI (DAI) workshop at AAAI-2024"},{"id":"http://arxiv.org/abs/2408.14381v1","updated":"2024-08-26T16:04:13Z","published":"2024-08-26T16:04:13Z","title":"Learning Tree-Structured Composition of Data Augmentation","summary":" Data augmentation is widely used for training a neural network given little\nlabeled data. A common practice of augmentation training is applying a\ncomposition of multiple transformations sequentially to the data. Existing\naugmentation methods such as RandAugment randomly sample from a list of\npre-selected transformations, while methods such as AutoAugment apply advanced\nsearch to optimize over an augmentation set of size $k^d$, which is the number\nof transformation sequences of length $d$, given a list of $k$ transformations.\n In this paper, we design efficient algorithms whose running time complexity\nis much faster than the worst-case complexity of $O(k^d)$, provably. We propose\na new algorithm to search for a binary tree-structured composition of $k$\ntransformations, where each tree node corresponds to one transformation. The\nbinary tree generalizes sequential augmentations, such as the SimCLR\naugmentation scheme for contrastive learning. Using a top-down, recursive\nsearch procedure, our algorithm achieves a runtime complexity of $O(2^d k)$,\nwhich is much faster than $O(k^d)$ as $k$ increases above $2$. We apply our\nalgorithm to tackle data distributions with heterogeneous subpopulations by\nsearching for one tree in each subpopulation and then learning a weighted\ncombination, resulting in a forest of trees.\n We validate our proposed algorithms on numerous graph and image datasets,\nincluding a multi-label graph classification dataset we collected. The dataset\nexhibits significant variations in the sizes of graphs and their average\ndegrees, making it ideal for studying data augmentation. We show that our\napproach can reduce the computation cost by 43% over existing search methods\nwhile improving performance by 4.3%. The tree structures can be used to\ninterpret the relative importance of each transformation, such as identifying\nthe important transformations on small vs. large graphs.\n","authors":["Dongyue Li","Kailai Chen","Predrag Radivojac","Hongyang R. 
Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14381v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2408.14371v1","updated":"2024-08-26T15:53:50Z","published":"2024-08-26T15:53:50Z","title":"SelEx: Self-Expertise in Fine-Grained Generalized Category Discovery","summary":" In this paper, we address Generalized Category Discovery, aiming to\nsimultaneously uncover novel categories and accurately classify known ones.\nTraditional methods, which lean heavily on self-supervision and contrastive\nlearning, often fall short when distinguishing between fine-grained categories.\nTo address this, we introduce a novel concept called `self-expertise', which\nenhances the model's ability to recognize subtle differences and uncover\nunknown categories. Our approach combines unsupervised and supervised\nself-expertise strategies to refine the model's discernment and generalization.\nInitially, hierarchical pseudo-labeling is used to provide `soft supervision',\nimproving the effectiveness of self-expertise. Our supervised technique differs\nfrom traditional methods by utilizing more abstract positive and negative\nsamples, aiding in the formation of clusters that can generalize to novel\ncategories. Meanwhile, our unsupervised strategy encourages the model to\nsharpen its category distinctions by considering within-category examples as\n`hard' negatives. Supported by theoretical insights, our empirical results\nshowcase that our method outperforms existing state-of-the-art techniques in\nGeneralized Category Discovery across several fine-grained datasets. Our code\nis available at: https://github.com/SarahRastegar/SelEx.\n","authors":["Sarah Rastegar","Mohammadreza Salehi","Yuki M. Asano","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2408.14371v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2310.07979v2","updated":"2024-08-26T15:51:38Z","published":"2023-10-12T01:57:27Z","title":"Graph-SCP: Accelerating Set Cover Problems with Graph Neural Networks","summary":" Machine learning (ML) approaches are increasingly being used to accelerate\ncombinatorial optimization (CO) problems. We investigate the Set Cover Problem\n(SCP) and propose Graph-SCP, a graph neural network method that augments\nexisting optimization solvers by learning to identify a much smaller\nsub-problem that contains the solution space. Graph-SCP uses both supervised\nlearning from prior solved instances and unsupervised learning aimed at\nminimizing the SCP objective. We evaluate the performance of Graph-SCP on\nsynthetically weighted and unweighted SCP instances with diverse problem\ncharacteristics and complexities, and on instances from the OR Library, a\ncanonical benchmark for SCP. We show that Graph-SCP reduces the problem size by\n60-80% and achieves runtime speedups of up to 10x on average when compared to\nGurobi (a state-of-the-art commercial solver), while maintaining solution\nquality. This is in contrast to fast greedy solutions that significantly\ncompromise solution quality to achieve guaranteed polynomial runtime. We\nshowcase Graph-SCP's ability to generalize to larger problem sizes, training on\nSCP instances with up to 3,000 subsets and testing on SCP instances with up to\n10,000 subsets.\n","authors":["Zohair Shafi","Benjamin A. Miller","Tina Eliassi-Rad","Rajmonda S. 
Caceres"],"pdf_url":"https://arxiv.org/pdf/2310.07979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14369v1","updated":"2024-08-26T15:49:31Z","published":"2024-08-26T15:49:31Z","title":"Exploiting Conjugate Label Information for Multi-Instance Partial-Label\n Learning","summary":" Multi-instance partial-label learning (MIPL) addresses scenarios where each\ntraining sample is represented as a multi-instance bag associated with a\ncandidate label set containing one true label and several false positives.\nExisting MIPL algorithms have primarily focused on mapping multi-instance bags\nto candidate label sets for disambiguation, disregarding the intrinsic\nproperties of the label space and the supervised information provided by\nnon-candidate label sets. In this paper, we propose an algorithm named ELIMIPL,\ni.e., Exploiting conjugate Label Information for Multi-Instance Partial-Label\nlearning, which exploits the conjugate label information to improve the\ndisambiguation performance. To achieve this, we extract the label information\nembedded in both candidate and non-candidate label sets, incorporating the\nintrinsic properties of the label space. Experimental results obtained from\nbenchmark and real-world datasets demonstrate the superiority of the proposed\nELIMIPL over existing MIPL algorithms and other well-established partial-label\nlearning algorithms.\n","authors":["Wei Tang","Weijia Zhang","Min-Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14369v1.pdf","comment":"Accepted at IJCAI 2024. The code can be found at\n https://github.com/tangw-seu/ELIMIPL"},{"id":"http://arxiv.org/abs/2408.14358v1","updated":"2024-08-26T15:32:31Z","published":"2024-08-26T15:32:31Z","title":"An Embedding is Worth a Thousand Noisy Labels","summary":" The performance of deep neural networks scales with dataset size and label\nquality, rendering the efficient mitigation of low-quality data annotations\ncrucial for building robust and cost-effective systems. Existing strategies to\naddress label noise exhibit severe limitations due to computational complexity\nand application dependency. In this work, we propose WANN, a Weighted Adaptive\nNearest Neighbor approach that builds on self-supervised feature\nrepresentations obtained from foundation models. To guide the weighted voting\nscheme, we introduce a reliability score, which measures the likelihood of a\ndata label being correct. WANN outperforms reference methods, including a\nlinear layer trained with robust loss functions, on diverse datasets of varying\nsize and under various noise types and severities. WANN also exhibits superior\ngeneralization on imbalanced data compared to both Adaptive-NNs (ANN) and fixed\nk-NNs. Furthermore, the proposed weighting scheme enhances supervised\ndimensionality reduction under noisy labels. This yields a significant boost in\nclassification performance with 10x and 100x smaller image embeddings,\nminimizing latency and storage requirements. Our approach, emphasizing\nefficiency and explainability, emerges as a simple, robust solution to overcome\nthe inherent limitations of deep neural network training. 
The code is available\nat https://github.com/francescodisalvo05/wann-noisy-labels .\n","authors":["Francesco Di Salvo","Sebastian Doerrich","Ines Rieger","Christian Ledig"],"pdf_url":"https://arxiv.org/pdf/2408.14358v1.pdf","comment":"Preprint submitted to the International Journal of Computer Vision\n (IJCV)"},{"id":"http://arxiv.org/abs/2408.14352v1","updated":"2024-08-26T15:29:34Z","published":"2024-08-26T15:29:34Z","title":"Assessing Contamination in Large Language Models: Introducing the\n LogProber method","summary":" In machine learning, contamination refers to situations where testing data\nleak into the training set. The issue is particularly relevant for the\nevaluation of the performance of Large Language Models (LLMs), which are\ngenerally trained on gargantuan, and generally opaque, corpora of text scraped\nfrom the world wide web. Developing tools to detect contamination is therefore\ncrucial to be able to fairly and properly track the evolution of the\nperformance of LLMs. Most recent works in the field are not tailored to\nquantify contamination on short sequences of text like we find in psychology\nquestionnaires. In the present paper we introduce LogProber, a novel,\nefficient, algorithm that we show able to detect contamination using token\nprobability in given sentences. In the second part we investigate the\nlimitations of the method and discuss how different training methods can\ncontaminate models without leaving traces in the token probabilities.\n","authors":["Nicolas Yax","Pierre-Yves Oudeyer","Stefano Palminteri"],"pdf_url":"https://arxiv.org/pdf/2408.14352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11126v2","updated":"2024-08-26T15:19:12Z","published":"2024-08-20T18:26:09Z","title":"Binocular Model: A deep learning solution for online melt pool\n temperature analysis using dual-wavelength Imaging Pyrometry","summary":" In metal Additive Manufacturing (AM), monitoring the temperature of the Melt\nPool (MP) is crucial for ensuring part quality, process stability, defect\nprevention, and overall process optimization. Traditional methods, are slow to\nconverge and require extensive manual effort to translate data into actionable\ninsights, rendering them impractical for real-time monitoring and control. To\naddress this challenge, we propose an Artificial Intelligence (AI)-based\nsolution aimed at reducing manual data processing reliance and improving the\nefficiency of transitioning from data to insight. In our study, we utilize a\ndataset comprising dual-wavelength real-time process monitoring data and\ncorresponding temperature maps. We introduce a deep learning model called the\n\"Binocular model,\" which exploits dual input observations to perform a precise\nanalysis of MP temperature in Laser Powder Bed Fusion (L-PBF). Through advanced\ndeep learning techniques, we seamlessly convert raw data into temperature maps,\nsignificantly streamlining the process and enabling batch processing at a rate\nof up to 750 frames per second, approximately 1000 times faster than\nconventional methods. Our Binocular model achieves high accuracy in temperature\nestimation, evidenced by a 0.95 R-squared score, while simultaneously enhancing\nprocessing efficiency by a factor of $\\sim1000x$ times. This model directly\naddresses the challenge of real-time MP temperature monitoring and offers\ninsights into the encountered constraints and the benefits of our Deep\nLearning-based approach. 
By combining efficiency and precision, our work\ncontributes to the advancement of temperature monitoring in L-PBF, thus driving\nprogress in the field of metal AM.\n","authors":["Javid Akhavan","Chaitanya Krishna Vallabh","Xiayun Zhao","Souran Manoochehri"],"pdf_url":"https://arxiv.org/pdf/2408.11126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04522v3","updated":"2024-08-26T15:13:22Z","published":"2024-07-05T14:07:15Z","title":"Graph Reinforcement Learning for Power Grids: A Comprehensive Survey","summary":" The rise of renewable energy and distributed generation requires new\napproaches to overcome the limitations of traditional methods. In this context,\nGraph Neural Networks are promising due to their ability to learn from\ngraph-structured data. Combined with Reinforcement Learning, they can serve as\ncontrol approaches to determine remedial network actions. This review analyses\nhow Graph Reinforcement Learning (GRL) can improve representation learning and\ndecision making in power grid use cases. Although GRL has demonstrated\nadaptability to unpredictable events and noisy data, it is primarily at a\nproof-of-concept stage. We highlight open challenges and limitations with\nrespect to real-world applications.\n","authors":["Mohamed Hassouna","Clara Holzhüter","Pawel Lytaev","Josephine Thomas","Bernhard Sick","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2407.04522v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14340v1","updated":"2024-08-26T15:13:14Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. 
The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elio Quinton","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wehhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14338v1","updated":"2024-08-26T15:07:35Z","published":"2024-08-26T15:07:35Z","title":"Machine Learning for Quantifier Selection in cvc5","summary":" In this work we considerably improve the state-of-the-art SMT solving on\nfirst-order quantified problems by efficient machine learning guidance of\nquantifier selection. Quantifiers represent a significant challenge for SMT and\nare technically a source of undecidability. In our approach, we train an\nefficient machine learning model that informs the solver which quantifiers\nshould be instantiated and which not. Each quantifier may be instantiated\nmultiple times and the set of the active quantifiers changes as the solving\nprogresses. Therefore, we invoke the ML predictor many times, during the whole\nrun of the solver. To make this efficient, we use fast ML models based on\ngradient boosting decision trees. We integrate our approach into the\nstate-of-the-art cvc5 SMT solver and show a considerable increase of the\nsystem's holdout-set performance after training it on a large set of\nfirst-order problems collected from the Mizar Mathematical Library.\n","authors":["Jan Jakubův","Mikoláš Janota","Jelle Piepenbrock","Josef Urban"],"pdf_url":"https://arxiv.org/pdf/2408.14338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14332v1","updated":"2024-08-26T15:01:04Z","published":"2024-08-26T15:01:04Z","title":"One-layer transformers fail to solve the induction heads task","summary":" A simple communication complexity argument proves that no one-layer\ntransformer can solve the induction heads task unless its size is exponentially\nlarger than the size sufficient for a two-layer transformer.\n","authors":["Clayton Sanford","Daniel Hsu","Matus Telgarsky"],"pdf_url":"https://arxiv.org/pdf/2408.14332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16528v2","updated":"2024-08-26T14:59:53Z","published":"2024-05-26T11:29:57Z","title":"LoQT: Low Rank Adapters for Quantized Training","summary":" Training of large neural networks requires significant computational\nresources. Despite advances using low-rank adapters and quantization,\npretraining of models such as LLMs on consumer hardware has not been possible\nwithout model sharding, offloading during training, or per-layer gradient\nupdates. To address these limitations, we propose LoQT, a method for\nefficiently training quantized models. LoQT uses gradient-based tensor\nfactorization to initialize low-rank trainable weight matrices that are\nperiodically merged into quantized full-rank weight matrices. 
Our approach is\nsuitable for both pretraining and fine-tuning of models, which we demonstrate\nexperimentally for language modeling and downstream task adaptation. We find\nthat LoQT enables efficient training of models up to 7B parameters on a\nconsumer-grade 24GB GPU. We also demonstrate the feasibility of training a 13B\nparameter model using per-layer gradient updates on the same hardware.\n","authors":["Sebastian Loeschcke","Mads Toftrup","Michael J. Kastoryano","Serge Belongie","Vésteinn Snæbjarnarson"],"pdf_url":"https://arxiv.org/pdf/2405.16528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14331v1","updated":"2024-08-26T14:55:40Z","published":"2024-08-26T14:55:40Z","title":"Automated Machine Learning in Insurance","summary":" Machine Learning (ML) has gained popularity in actuarial research and\ninsurance industrial applications. However, the performance of most ML tasks\nheavily depends on data preprocessing, model selection, and hyperparameter\noptimization, which are considered to be intensive in terms of domain\nknowledge, experience, and manual labor. Automated Machine Learning (AutoML)\naims to automatically complete the full life-cycle of ML tasks and provides\nstate-of-the-art ML models without human intervention or supervision. This\npaper introduces an AutoML workflow that allows users without domain knowledge\nor prior experience to achieve robust and effortless ML deployment by writing\nonly a few lines of code. This proposed AutoML is specifically tailored for the\ninsurance application, with features like the balancing step in data\npreprocessing, ensemble pipelines, and customized loss functions. These\nfeatures are designed to address the unique challenges of the insurance domain,\nincluding the imbalanced nature of common insurance datasets. The full code and\ndocumentation are available on the GitHub repository.\n(https://github.com/PanyiDong/InsurAutoML)\n","authors":["Panyi Dong","Zhiyu Quan"],"pdf_url":"https://arxiv.org/pdf/2408.14331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14326v1","updated":"2024-08-26T14:54:14Z","published":"2024-08-26T14:54:14Z","title":"Streamline tractography of the fetal brain in utero with machine\n learning","summary":" Diffusion-weighted magnetic resonance imaging (dMRI) is the only non-invasive\ntool for studying white matter tracts and structural connectivity of the brain.\nThese assessments rely heavily on tractography techniques, which reconstruct\nvirtual streamlines representing white matter fibers. Much effort has been\ndevoted to improving tractography methodology for adult brains, while\ntractography of the fetal brain has been largely neglected. Fetal tractography\nfaces unique difficulties due to low dMRI signal quality, immature and rapidly\ndeveloping brain structures, and paucity of reference data. This work presents\nthe first machine learning model for fetal tractography. The model input\nconsists of five sources of information: (1) Fiber orientation, inferred from a\ndiffusion tensor fit to the dMRI signal; (2) Directions of recent propagation\nsteps; (3) Global spatial information, encoded as distances to keypoints in the\nbrain cortex; (4) Tissue segmentation information; and (5) Prior information\nabout the expected local fiber orientations supplied with an atlas. In order to\nmitigate the local tensor estimation error, a large spatial context around the\ncurrent point in the diffusion tensor image is encoded using convolutional and\nattention neural network modules. 
Moreover, the diffusion tensor information at\na hypothetical next point is included in the model input. Filtering rules based\non anatomically constrained tractography are applied to prune implausible\nstreamlines. We trained the model on manually-refined whole-brain fetal\ntractograms and validated the trained model on an independent set of 11 test\nscans with gestational ages between 23 and 36 weeks. Results show that our\nproposed method achieves superior performance across all evaluated tracts. The\nnew method can significantly advance the capabilities of dMRI for studying\nnormal and abnormal brain development in utero.\n","authors":["Weide Liu","Camilo Calixto","Simon K. Warfield","Davood Karimi"],"pdf_url":"https://arxiv.org/pdf/2408.14326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14325v1","updated":"2024-08-26T14:54:13Z","published":"2024-08-26T14:54:13Z","title":"Function-Space MCMC for Bayesian Wide Neural Networks","summary":" Bayesian Neural Networks represent a fascinating confluence of deep learning\nand probabilistic reasoning, offering a compelling framework for understanding\nuncertainty in complex predictive models. In this paper, we investigate the use\nof the preconditioned Crank-Nicolson algorithm and its Langevin version to\nsample from the reparametrised posterior distribution of the weights as the\nwidths of Bayesian Neural Networks grow larger. In addition to being robust in\nthe infinite-dimensional setting, we prove that the acceptance probabilities of\nthe proposed methods approach 1 as the width of the network increases,\nindependently of any stepsize tuning. Moreover, we examine and compare how the\nmixing speeds of the underdamped Langevin Monte Carlo, the preconditioned\nCrank-Nicolson and the preconditioned Crank-Nicolson Langevin samplers are\ninfluenced by changes in the network width in some real-world cases. Our\nfindings suggest that, in wide Bayesian Neural Networks configurations, the\npreconditioned Crank-Nicolson method allows for more efficient sampling of the\nreparametrised posterior distribution, as evidenced by a higher effective\nsample size and improved diagnostic results compared with the other analysed\nalgorithms.\n","authors":["Lucia Pezzetti","Stefano Favaro","Stefano Pelucchetti"],"pdf_url":"https://arxiv.org/pdf/2408.14325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14319v1","updated":"2024-08-26T14:51:26Z","published":"2024-08-26T14:51:26Z","title":"Rethinking Knowledge Transfer in Learning Using Privileged Information","summary":" In supervised machine learning, privileged information (PI) is information\nthat is unavailable at inference, but is accessible during training time.\nResearch on learning using privileged information (LUPI) aims to transfer the\nknowledge captured in PI onto a model that can perform inference without PI. It\nseems that this extra bit of information ought to make the resulting model\nbetter. However, finding conclusive theoretical or empirical evidence that\nsupports the ability to transfer knowledge using PI has been challenging. In\nthis paper, we critically examine the assumptions underlying existing\ntheoretical analyses and argue that there is little theoretical justification\nfor when LUPI should work. We analyze LUPI methods and reveal that apparent\nimprovements in empirical risk of existing research may not directly result\nfrom PI. Instead, these improvements often stem from dataset anomalies or\nmodifications in model design misguidedly attributed to PI. 
Our experiments for\na wide variety of application domains further demonstrate that state-of-the-art\nLUPI approaches fail to effectively transfer knowledge from PI. Thus, we\nadvocate for practitioners to exercise caution when working with PI to avoid\nunintended inductive biases.\n","authors":["Danil Provodin","Bram van den Akker","Christina Katsimerou","Maurits Kaptein","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2408.14319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07437v3","updated":"2024-08-26T14:46:08Z","published":"2023-02-15T02:58:09Z","title":"Bridging the Usability Gap: Theoretical and Methodological Advances for\n Spectral Learning of Hidden Markov Models","summary":" The Baum-Welch (B-W) algorithm is the most widely accepted method for\ninferring hidden Markov models (HMM). However, it is prone to getting stuck in\nlocal optima, and can be too slow for many real-time applications. Spectral\nlearning of HMMs (SHMM), based on the method of moments (MOM) has been proposed\nin the literature to overcome these obstacles. Despite its promises, asymptotic\ntheory for SHMM has been elusive, and the long-run performance of SHMM can\ndegrade due to unchecked propagation of error. In this paper, we (1) provide an\nasymptotic distribution for the approximate error of the likelihood estimated\nby SHMM, (2) propose a novel algorithm called projected SHMM (PSHMM) that\nmitigates the problem of error propagation, and (3) develop online learning\nvariants of both SHMM and PSHMM that accommodate potential nonstationarity. We\ncompare the performance of SHMM with PSHMM and estimation through the B-W\nalgorithm on both simulated data and data from real world applications, and\nfind that PSHMM not only retains the computational advantages of SHMM, but also\nprovides more robust estimation and forecasting.\n","authors":["Xiaoyuan Ma","Jordan Rodu"],"pdf_url":"https://arxiv.org/pdf/2302.07437v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14307v1","updated":"2024-08-26T14:38:19Z","published":"2024-08-26T14:38:19Z","title":"LLM-3D Print: Large Language Models To Monitor and Control 3D Printing","summary":" Industry 4.0 has revolutionized manufacturing by driving digitalization and\nshifting the paradigm toward additive manufacturing (AM). Fused Deposition\nModeling (FDM), a key AM technology, enables the creation of highly customized,\ncost-effective products with minimal material waste through layer-by-layer\nextrusion, posing a significant challenge to traditional subtractive methods.\nHowever, the susceptibility of material extrusion techniques to errors often\nrequires expert intervention to detect and mitigate defects that can severely\ncompromise product quality. While automated error detection and machine\nlearning models exist, their generalizability across diverse 3D printer setups,\nfirmware, and sensors is limited, and deep learning methods require extensive\nlabeled datasets, hindering scalability and adaptability. To address these\nchallenges, we present a process monitoring and control framework that\nleverages pre-trained Large Language Models (LLMs) alongside 3D printers to\ndetect and address printing defects. The LLM evaluates print quality by\nanalyzing images captured after each layer or print segment, identifying\nfailure modes and querying the printer for relevant parameters. It then\ngenerates and executes a corrective action plan. 
We validated the effectiveness\nof the proposed framework in identifying defects by comparing it against a\ncontrol group of engineers with diverse AM expertise. Our evaluation\ndemonstrated that LLM-based agents not only accurately identify common 3D\nprinting errors, such as inconsistent extrusion, stringing, warping, and layer\nadhesion, but also effectively determine the parameters causing these failures\nand autonomously correct them without any need for human intervention.\n","authors":["Yayati Jadhav","Peter Pak","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2408.14307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14284v1","updated":"2024-08-26T14:09:40Z","published":"2024-08-26T14:09:40Z","title":"May the Forgetting Be with You: Alternate Replay for Learning with Noisy\n Labels","summary":" Forgetting presents a significant challenge during incremental training,\nmaking it particularly demanding for contemporary AI systems to assimilate new\nknowledge in streaming data environments. To address this issue, most\napproaches in Continual Learning (CL) rely on the replay of a restricted buffer\nof past data. However, the presence of noise in real-world scenarios, where\nhuman annotation is constrained by time limitations or where data is\nautomatically gathered from the web, frequently renders these strategies\nvulnerable. In this study, we address the problem of CL under Noisy Labels\n(CLN) by introducing Alternate Experience Replay (AER), which takes advantage\nof forgetting to maintain a clear distinction between clean, complex, and noisy\nsamples in the memory buffer. The idea is that complex or mislabeled examples,\nwhich hardly fit the previously learned data distribution, are most likely to\nbe forgotten. To grasp the benefits of such a separation, we equip AER with\nAsymmetric Balanced Sampling (ABS): a new sample selection strategy that\nprioritizes purity on the current task while retaining relevant samples from\nthe past. Through extensive computational comparisons, we demonstrate the\neffectiveness of our approach in terms of both accuracy and purity of the\nobtained buffer, resulting in a remarkable average gain of 4.71% points in\naccuracy with respect to existing loss-based purification strategies. Code is\navailable at https://github.com/aimagelab/mammoth.\n","authors":["Monica Millunzi","Lorenzo Bonicelli","Angelo Porrello","Jacopo Credi","Petter N. Kolm","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2408.14284v1.pdf","comment":"25 pages, 5 figures. Accepted at the The 35th British Machine Vision\n Conference 2024 (BMVC 2024), Glasgow, UK"},{"id":"http://arxiv.org/abs/2305.07715v2","updated":"2024-08-26T14:09:37Z","published":"2023-05-12T18:14:21Z","title":"Field theory for optimal signal propagation in ResNets","summary":" Residual networks have significantly better trainability and thus performance\nthan feed-forward networks at large depth. Introducing skip connections\nfacilitates signal propagation to deeper layers. In addition, previous works\nfound that adding a scaling parameter for the residual branch further improves\ngeneralization performance. While they empirically identified a particularly\nbeneficial range of values for this scaling parameter, the associated\nperformance improvement and its universality across network hyperparameters yet\nneed to be understood. For feed-forward networks, finite-size theories have led\nto important insights with regard to signal propagation and hyperparameter\ntuning. 
We here derive a systematic finite-size field theory for residual\nnetworks to study signal propagation and its dependence on the scaling for the\nresidual branch. We derive analytical expressions for the response function, a\nmeasure for the network's sensitivity to inputs, and show that for deep\nnetworks the empirically found values for the scaling parameter lie within the\nrange of maximal sensitivity. Furthermore, we obtain an analytical expression\nfor the optimal scaling parameter that depends only weakly on other network\nhyperparameters, such as the weight variance, thereby explaining its\nuniversality across hyperparameters. Overall, this work provides a theoretical\nframework to study ResNets at finite size.\n","authors":["Kirsten Fischer","David Dahmen","Moritz Helias"],"pdf_url":"https://arxiv.org/pdf/2305.07715v2.pdf","comment":"21 pages, 8 figures, under review"},{"id":"http://arxiv.org/abs/2408.12615v2","updated":"2024-08-26T14:06:59Z","published":"2024-08-08T14:11:06Z","title":"Pediatric TSC-Related Epilepsy Classification from Clinical MR Images\n Using Quantum Neural Network","summary":" Tuberous sclerosis complex (TSC) manifests as a multisystem disorder with\nsignificant neurological implications. This study addresses the critical need\nfor robust classification models tailored to TSC in pediatric patients,\nintroducing QResNet,a novel deep learning model seamlessly integrating\nconventional convolutional neural networks with quantum neural networks. The\nmodel incorporates a two-layer quantum layer (QL), comprising ZZFeatureMap and\nAnsatz layers, strategically designed for processing classical data within a\nquantum framework. A comprehensive evaluation, demonstrates the superior\nperformance of QResNet in TSC MRI image classification compared to conventional\n3D-ResNet models. These compelling findings underscore the potential of quantum\ncomputing to revolutionize medical imaging and diagnostics.Remarkably, this\nmethod surpasses conventional CNNs in accuracy and Area Under the Curve (AUC)\nmetrics with the current dataset. Future research endeavors may focus on\nexploring the scalability and practical implementation of quantum algorithms in\nreal-world medical imaging scenarios.\n","authors":["Ling Lin","Yihang Zhou","Zhanqi Hu","Dian Jiang","Congcong Liu","Shuo Zhou","Yanjie Zhu","Jianxiang Liao","Dong Liang","Hairong Zheng","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12615v2.pdf","comment":"5 pages,4 figures,2 tables,presented at ISBI 2024"},{"id":"http://arxiv.org/abs/2408.14281v1","updated":"2024-08-26T14:02:30Z","published":"2024-08-26T14:02:30Z","title":"Uncertainties of Latent Representations in Computer Vision","summary":" Uncertainty quantification is a key pillar of trustworthy machine learning.\nIt enables safe reactions under unsafe inputs, like predicting only when the\nmachine learning model detects sufficient evidence, discarding anomalous data,\nor emitting warnings when an error is likely to be inbound. This is\nparticularly crucial in safety-critical areas like medical image classification\nor self-driving cars. Despite the plethora of proposed uncertainty\nquantification methods achieving increasingly higher scores on performance\nbenchmarks, uncertainty estimates are often shied away from in practice. Many\nmachine learning projects start from pretrained latent representations that\ncome without uncertainty estimates. 
Uncertainties would need to be trained by\npractitioners on their own, which is notoriously difficult and\nresource-intense.\n This thesis makes uncertainty estimates easily accessible by adding them to\nthe latent representation vectors of pretrained computer vision models. Besides\nproposing approaches rooted in probability and decision theory, such as\nMonte-Carlo InfoNCE (MCInfoNCE) and loss prediction, we delve into both\ntheoretical and empirical questions. We show that these unobservable\nuncertainties about unobservable latent representations are indeed provably\ncorrect. We also provide an uncertainty-aware representation learning (URL)\nbenchmark to compare these unobservables against observable ground-truths.\nFinally, we compile our findings to pretrain lightweight representation\nuncertainties on large-scale computer vision models that transfer to unseen\ndatasets in a zero-shot manner.\n Our findings do not only advance the current theoretical understanding of\nuncertainties over latent variables, but also facilitate the access to\nuncertainty quantification for future researchers inside and outside the field,\nenabling straightforward but trustworthy machine learning.\n","authors":["Michael Kirchhof"],"pdf_url":"https://arxiv.org/pdf/2408.14281v1.pdf","comment":"Doctoral thesis"},{"id":"http://arxiv.org/abs/2312.01210v4","updated":"2024-08-26T13:57:31Z","published":"2023-12-02T19:39:50Z","title":"When accurate prediction models yield harmful self-fulfilling prophecies","summary":" Prediction models are popular in medical research and practice. By predicting\nan outcome of interest for specific patients, these models may help inform\ndifficult treatment decisions, and are often hailed as the poster children for\npersonalized, data-driven healthcare. We show however, that using prediction\nmodels for decision making can lead to harmful decisions, even when the\npredictions exhibit good discrimination after deployment. These models are\nharmful self-fulfilling prophecies: their deployment harms a group of patients\nbut the worse outcome of these patients does not invalidate the predictive\npower of the model. Our main result is a formal characterization of a set of\nsuch prediction models. Next we show that models that are well calibrated\nbefore and after deployment are useless for decision making as they made no\nchange in the data distribution. These results point to the need to revise\nstandard practices for validation, deployment and evaluation of prediction\nmodels that are used in medical decisions.\n","authors":["Wouter A. C. van Amsterdam","Nan van Geloven","Jesse H. Krijthe","Rajesh Ranganath","Giovanni Ciná"],"pdf_url":"https://arxiv.org/pdf/2312.01210v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09131v2","updated":"2024-08-26T13:48:32Z","published":"2024-06-13T14:02:18Z","title":"OLGA: One-cLass Graph Autoencoder","summary":" One-class learning (OCL) comprises a set of techniques applied when\nreal-world problems have a single class of interest. The usual procedure for\nOCL is learning a hypersphere that comprises instances of this class and,\nideally, repels unseen instances from any other classes. Besides, several OCL\nalgorithms for graphs have been proposed since graph representation learning\nhas succeeded in various fields. 
These methods may use a two-step strategy,\ninitially representing the graph and, in a second step, classifying its nodes.\nOn the other hand, end-to-end methods learn the node representations while\nclassifying the nodes in one learning process. We highlight three main gaps in\nthe literature on OCL for graphs: (i) non-customized representations for OCL;\n(ii) the lack of constraints on hypersphere parameters learning; and (iii) the\nmethods' lack of interpretability and visualization. We propose One-cLass Graph\nAutoencoder (OLGA). OLGA is end-to-end and learns the representations for the\ngraph nodes while encapsulating the interest instances by combining two loss\nfunctions. We propose a new hypersphere loss function to encapsulate the\ninterest instances. OLGA combines this new hypersphere loss with the graph\nautoencoder reconstruction loss to improve model learning. OLGA achieved\nstate-of-the-art results and outperformed six other methods with a\nstatistically significant difference from five methods. Moreover, OLGA learns\nlow-dimensional representations maintaining the classification performance with\nan interpretable model representation learning and results.\n","authors":["M. P. S. Gôlo","J. G. B. M. Junior","D. F. Silva","R. M. Marcacini"],"pdf_url":"https://arxiv.org/pdf/2406.09131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07182v7","updated":"2024-08-26T13:43:46Z","published":"2022-10-13T17:03:36Z","title":"PDEBENCH: An Extensive Benchmark for Scientific Machine Learning","summary":" Machine learning-based modeling of physical systems has experienced increased\ninterest in recent years. Despite some impressive progress, there is still a\nlack of benchmarks for Scientific ML that are easy to use but still challenging\nand representative of a wide range of problems. We introduce PDEBench, a\nbenchmark suite of time-dependent simulation tasks based on Partial\nDifferential Equations (PDEs). PDEBench comprises both code and data to\nbenchmark the performance of novel machine learning models against both\nclassical numerical simulations and machine learning baselines. Our proposed\nset of benchmark problems contribute the following unique features: (1) A much\nwider range of PDEs compared to existing benchmarks, ranging from relatively\ncommon examples to more realistic and difficult problems; (2) much larger\nready-to-use datasets compared to prior work, comprising multiple simulation\nruns across a larger number of initial and boundary conditions and PDE\nparameters; (3) more extensible source codes with user-friendly APIs for data\ngeneration and baseline results with popular machine learning models (FNO,\nU-Net, PINN, Gradient-Based Inverse Method). PDEBench allows researchers to\nextend the benchmark freely for their own purposes using a standardized API and\nto compare the performance of new models to existing baseline methods. We also\npropose new evaluation metrics with the aim to provide a more holistic\nunderstanding of learning methods in the context of Scientific ML. With those\nmetrics we identify tasks which are challenging for recent ML methods and\npropose these tasks as future challenges for the community. 
The code is\navailable at https://github.com/pdebench/PDEBench.\n","authors":["Makoto Takamoto","Timothy Praditia","Raphael Leiteritz","Dan MacKinlay","Francesco Alesiani","Dirk Pflüger","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2210.07182v7.pdf","comment":"16 pages (main body) + 34 pages (supplemental material), accepted for\n publication in NeurIPS 2022 Track Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2408.14267v1","updated":"2024-08-26T13:42:43Z","published":"2024-08-26T13:42:43Z","title":"1-Bit FQT: Pushing the Limit of Fully Quantized Training to 1-bit","summary":" Fully quantized training (FQT) accelerates the training of deep neural\nnetworks by quantizing the activations, weights, and gradients into lower\nprecision. To explore the ultimate limit of FQT (the lowest achievable\nprecision), we make a first attempt to 1-bit FQT. We provide a theoretical\nanalysis of FQT based on Adam and SGD, revealing that the gradient variance\ninfluences the convergence of FQT. Building on these theoretical results, we\nintroduce an Activation Gradient Pruning (AGP) strategy. The strategy leverages\nthe heterogeneity of gradients by pruning less informative gradients and\nenhancing the numerical precision of remaining gradients to mitigate gradient\nvariance. Additionally, we propose Sample Channel joint Quantization (SCQ),\nwhich utilizes different quantization strategies in the computation of weight\ngradients and activation gradients to ensure that the method is friendly to\nlow-bitwidth hardware. Finally, we present a framework to deploy our algorithm.\nFor fine-tuning VGGNet-16 and ResNet-18 on multiple datasets, our algorithm\nachieves an average accuracy improvement of approximately 6%, compared to\nper-sample quantization. Moreover, our training speedup can reach a maximum of\n5.13x compared to full precision training.\n","authors":["Chang Gao","Jianfei Chen","Kang Zhao","Jiaqi Wang","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2408.14267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14266v1","updated":"2024-08-26T13:40:33Z","published":"2024-08-26T13:40:33Z","title":"HyperSBINN: A Hypernetwork-Enhanced Systems Biology-Informed Neural\n Network for Efficient Drug Cardiosafety Assessment","summary":" Mathematical modeling in systems toxicology enables a comprehensive\nunderstanding of the effects of pharmaceutical substances on cardiac health.\nHowever, the complexity of these models limits their widespread application in\nearly drug discovery. In this paper, we introduce a novel approach to solving\nparameterized models of cardiac action potentials by combining meta-learning\ntechniques with Systems Biology-Informed Neural Networks (SBINNs). The proposed\nmethod, HyperSBINN, effectively addresses the challenge of predicting the\neffects of various compounds at different concentrations on cardiac action\npotentials, outperforming traditional differential equation solvers in speed.\nOur model efficiently handles scenarios with limited data and complex\nparameterized differential equations. The HyperSBINN model demonstrates robust\nperformance in predicting APD90 values, indicating its potential as a reliable\ntool for modeling cardiac electrophysiology and aiding in preclinical drug\ndevelopment. 
This framework represents an advancement in computational\nmodeling, offering a scalable and efficient solution for simulating and\nunderstanding complex biological systems.\n","authors":["Inass Soukarieh","Gerhard Hessler","Hervé Minoux","Marcel Mohr","Friedemann Schmidt","Jan Wenzel","Pierre Barbillon","Hugo Gangloff","Pierre Gloaguen"],"pdf_url":"https://arxiv.org/pdf/2408.14266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14254v1","updated":"2024-08-26T13:16:42Z","published":"2024-08-26T13:16:42Z","title":"Integrated Brain Connectivity Analysis with fMRI, DTI, and sMRI Powered\n by Interpretable Graph Neural Networks","summary":" Multimodal neuroimaging modeling has becomes a widely used approach but\nconfronts considerable challenges due to heterogeneity, which encompasses\nvariability in data types, scales, and formats across modalities. This\nvariability necessitates the deployment of advanced computational methods to\nintegrate and interpret these diverse datasets within a cohesive analytical\nframework. In our research, we amalgamate functional magnetic resonance\nimaging, diffusion tensor imaging, and structural MRI into a cohesive\nframework. This integration capitalizes on the unique strengths of each\nmodality and their inherent interconnections, aiming for a comprehensive\nunderstanding of the brain's connectivity and anatomical characteristics.\nUtilizing the Glasser atlas for parcellation, we integrate imaging derived\nfeatures from various modalities: functional connectivity from fMRI, structural\nconnectivity from DTI, and anatomical features from sMRI within consistent\nregions. Our approach incorporates a masking strategy to differentially weight\nneural connections, thereby facilitating a holistic amalgamation of multimodal\nimaging data. This technique enhances interpretability at connectivity level,\ntranscending traditional analyses centered on singular regional attributes. The\nmodel is applied to the Human Connectome Project's Development study to\nelucidate the associations between multimodal imaging and cognitive functions\nthroughout youth. The analysis demonstrates improved predictive accuracy and\nuncovers crucial anatomical features and essential neural connections,\ndeepening our understanding of brain structure and function.\n","authors":["Gang Qu","Ziyu Zhou","Vince D. Calhoun","Aiying Zhang","Yu-Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14252v1","updated":"2024-08-26T13:14:26Z","published":"2024-08-26T13:14:26Z","title":"An Evaluation of Explanation Methods for Black-Box Detectors of\n Machine-Generated Text","summary":" The increasing difficulty to distinguish language-model-generated from\nhuman-written text has led to the development of detectors of machine-generated\ntext (MGT). However, in many contexts, a black-box prediction is not\nsufficient, it is equally important to know on what grounds a detector made\nthat prediction. Explanation methods that estimate feature importance promise\nto provide indications of which parts of an input are used by classifiers for\nprediction. However, the quality of different explanation methods has not\npreviously been assessed for detectors of MGT. This study conducts the first\nsystematic evaluation of explanation quality for this task. The dimensions of\nfaithfulness and stability are assessed with five automated experiments, and\nusefulness is evaluated in a user study. 
We use a dataset of ChatGPT-generated\nand human-written documents, and pair predictions of three existing\nlanguage-model-based detectors with the corresponding SHAP, LIME, and Anchor\nexplanations. We find that SHAP performs best in terms of faithfulness,\nstability, and in helping users to predict the detector's behavior. In\ncontrast, LIME, perceived as most useful by users, scores the worst in terms of\nuser performance at predicting the detectors' behavior.\n","authors":["Loris Schoenegger","Yuxi Xia","Benjamin Roth"],"pdf_url":"https://arxiv.org/pdf/2408.14252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03816v2","updated":"2024-08-26T13:12:45Z","published":"2024-08-07T14:52:06Z","title":"Early Prediction of Causes (not Effects) in Healthcare by Long-Term\n Clinical Time Series Forecasting","summary":" Machine learning for early syndrome diagnosis aims to solve the intricate\ntask of predicting a ground truth label that most often is the outcome (effect)\nof a medical consensus definition applied to observed clinical measurements\n(causes), given clinical measurements observed several hours before. Instead of\nfocusing on the prediction of the future effect, we propose to directly predict\nthe causes via time series forecasting (TSF) of clinical variables and\ndetermine the effect by applying the gold standard consensus definition to the\nforecasted values. This method has the invaluable advantage of being\nstraightforwardly interpretable to clinical practitioners, and because model\ntraining does not rely on a particular label anymore, the forecasted data can\nbe used to predict any consensus-based label. We exemplify our method by means\nof long-term TSF with Transformer models, with a focus on accurate prediction\nof sparse clinical variables involved in the SOFA-based Sepsis-3 definition and\nthe new Simplified Acute Physiology Score (SAPS-II) definition. Our experiments\nare conducted on two datasets and show that contrary to recent proposals which\nadvocate set function encoders for time series and direct multi-step decoders,\nbest results are achieved by a combination of standard dense encoders with\niterative multi-step decoders. The key for success of iterative multi-step\ndecoding can be attributed to its ability to capture cross-variate dependencies\nand to a student forcing training strategy that teaches the model to rely on\nits own previous time step predictions for the next time step prediction.\n","authors":["Michael Staniek","Marius Fracarolli","Michael Hagmann","Stefan Riezler"],"pdf_url":"https://arxiv.org/pdf/2408.03816v2.pdf","comment":"Published at Machine Learning for Healthcare (MLHC), Toronto, 2024"},{"id":"http://arxiv.org/abs/2408.12658v2","updated":"2024-08-26T13:02:46Z","published":"2024-08-22T18:04:29Z","title":"Hierarchical Generative Modeling of Melodic Vocal Contours in Hindustani\n Classical Music","summary":" Hindustani music is a performance-driven oral tradition that exhibits the\nrendition of rich melodic patterns. In this paper, we focus on generative\nmodeling of singers' vocal melodies extracted from audio recordings, as the\nvoice is musically prominent within the tradition. Prior generative work in\nHindustani music models melodies as coarse discrete symbols which fails to\ncapture the rich expressive melodic intricacies of singing. Thus, we propose to\nuse a finely quantized pitch contour, as an intermediate representation for\nhierarchical audio modeling. 
We propose GaMaDHaNi, a modular two-level\nhierarchy, consisting of a generative model on pitch contours, and a pitch\ncontour to audio synthesis model. We compare our approach to non-hierarchical\naudio models and hierarchical models that use a self-supervised intermediate\nrepresentation, through a listening test and qualitative analysis. We also\nevaluate audio model's ability to faithfully represent the pitch contour input\nusing Pearson correlation coefficient. By using pitch contours as an\nintermediate representation, we show that our model may be better equipped to\nlisten and respond to musicians in a human-AI collaborative setting by\nhighlighting two potential interaction use cases (1) primed generation, and (2)\ncoarse pitch conditioning.\n","authors":["Nithya Shikarpur","Krishna Maneesha Dendukuri","Yusong Wu","Antoine Caillon","Cheng-Zhi Anna Huang"],"pdf_url":"https://arxiv.org/pdf/2408.12658v2.pdf","comment":"Accepted at International Society for Music Information Retrieval\n (ISMIR) 2024"},{"id":"http://arxiv.org/abs/2408.14236v1","updated":"2024-08-26T12:50:27Z","published":"2024-08-26T12:50:27Z","title":"DSTI at LLMs4OL 2024 Task A: Intrinsic versus extrinsic knowledge for\n type classification","summary":" We introduce semantic towers, an extrinsic knowledge representation method,\nand compare it to intrinsic knowledge in large language models for ontology\nlearning. Our experiments show a trade-off between performance and semantic\ngrounding for extrinsic knowledge compared to a fine-tuned model intrinsic\nknowledge. We report our findings on the Large Language Models for Ontology\nLearning (LLMs4OL) 2024 challenge.\n","authors":["Hanna Abi Akl"],"pdf_url":"https://arxiv.org/pdf/2408.14236v1.pdf","comment":"8 pages, 4 figures, accepted for the LLMs4OL challenge at the\n International Semantic Web Conference (ISWC) 2024"},{"id":"http://arxiv.org/abs/2408.14234v1","updated":"2024-08-26T12:49:41Z","published":"2024-08-26T12:49:41Z","title":"FSDEM: Feature Selection Dynamic Evaluation Metric","summary":" Expressive evaluation metrics are indispensable for informative experiments\nin all areas, and while several metrics are established in some areas, in\nothers, such as feature selection, only indirect or otherwise limited\nevaluation metrics are found. In this paper, we propose a novel evaluation\nmetric to address several problems of its predecessors and allow for flexible\nand reliable evaluation of feature selection algorithms. The proposed metric is\na dynamic metric with two properties that can be used to evaluate both the\nperformance and the stability of a feature selection algorithm. We conduct\nseveral empirical experiments to illustrate the use of the proposed metric in\nthe successful evaluation of feature selection algorithms. We also provide a\ncomparison and analysis to show the different aspects involved in the\nevaluation of the feature selection algorithms. The results indicate that the\nproposed metric is successful in carrying out the evaluation task for feature\nselection algorithms.\n This paper is an extended version of a paper accepted at SISAP 2024.\n","authors":["Muhammad Rajabinasab","Anton D. 
Lautrup","Tobias Hyrup","Arthur Zimek"],"pdf_url":"https://arxiv.org/pdf/2408.14234v1.pdf","comment":"Short version of this paper is accepted at 17th International\n Conference on Similarity Search and Applications, SISAP 2024"},{"id":"http://arxiv.org/abs/2408.14229v1","updated":"2024-08-26T12:44:17Z","published":"2024-08-26T12:44:17Z","title":"Gallery-Aware Uncertainty Estimation For Open-Set Face Recognition","summary":" Accurately estimating image quality and model robustness improvement are\ncritical challenges in unconstrained face recognition, which can be addressed\nthrough uncertainty estimation via probabilistic face embeddings. Previous\nresearch mainly focused on uncertainty estimation in face verification, leaving\nthe open-set face recognition task underexplored. In open-set face recognition,\none seeks to classify an image, which could also be unknown. Here, the low\nvariance of probabilistic embedding does not imply a low error probability: an\nimage embedding could be close to several classes in a gallery, thus yielding\nhigh uncertainty. We propose a method aware of two sources of ambiguity in the\nopen-set recognition system: (1) the gallery uncertainty caused by overlapping\nclasses and (2) the uncertainty of the face embeddings. To detect both types,\nwe use a Bayesian probabilistic model of embedding distribution, which provides\na principled uncertainty estimate. Challenging open-set face recognition\ndatasets, such as IJB-C, serve as a testbed for our method. We also propose a\nnew open-set recognition protocol for whale and dolphin identification. The\nproposed approach better identifies recognition errors than uncertainty\nestimation methods based solely on image quality.\n","authors":["Leonid Erlygin","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2408.14229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14225v1","updated":"2024-08-26T12:41:41Z","published":"2024-08-26T12:41:41Z","title":"Provable Imbalanced Point Clustering","summary":" We suggest efficient and provable methods to compute an approximation for\nimbalanced point clustering, that is, fitting $k$-centers to a set of points in\n$\\mathbb{R}^d$, for any $d,k\\geq 1$. To this end, we utilize \\emph{coresets},\nwhich, in the context of the paper, are essentially weighted sets of points in\n$\\mathbb{R}^d$ that approximate the fitting loss for every model in a given\nset, up to a multiplicative factor of $1\\pm\\varepsilon$. We provide [Section 3\nand Section E in the appendix] experiments that show the empirical contribution\nof our suggested methods for real images (novel and reference), synthetic data,\nand real-world data. We also propose choice clustering, which by combining\nclustering algorithms yields better performance than each one separately.\n","authors":["David Denisov","Dan Feldman","Shlomi Dolev","Michael Segal"],"pdf_url":"https://arxiv.org/pdf/2408.14225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02378v2","updated":"2024-08-26T12:36:51Z","published":"2023-07-05T15:45:53Z","title":"Continuum Limits of Ollivier's Ricci Curvature on data clouds: pointwise\n consistency and global lower bounds","summary":" Let $M$ denote a low-dimensional manifold embedded in Euclidean space and let\n${X}= \\{ x_1, \\dots, x_n \\}$ be a collection of points uniformly sampled from\nit. We study the relationship between the curvature of a random geometric graph\nbuilt from ${X}$ and the curvature of the manifold $M$ via continuum limits of\nOllivier's discrete Ricci curvature. 
We prove pointwise, non-asymptotic\nconsistency results and also show that if $M$ has Ricci curvature bounded from\nbelow by a positive constant, then the random geometric graph will inherit this\nglobal structural property with high probability. We discuss applications of\nthe global discrete curvature bounds to contraction properties of heat kernels\non graphs, as well as implications for manifold learning from data clouds. In\nparticular, we show that our consistency results allow for estimating the\nintrinsic curvature of a manifold by first estimating concrete extrinsic\nquantities.\n","authors":["Nicolas Garcia Trillos","Melanie Weber"],"pdf_url":"https://arxiv.org/pdf/2307.02378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11341v2","updated":"2024-08-26T12:14:31Z","published":"2024-04-17T13:00:52Z","title":"The Causal Chambers: Real Physical Systems as a Testbed for AI\n Methodology","summary":" In some fields of AI, machine learning and statistics, the validation of new\nmethods and algorithms is often hindered by the scarcity of suitable real-world\ndatasets. Researchers must often turn to simulated data, which yields limited\ninformation about the applicability of the proposed methods to real problems.\nAs a step forward, we have constructed two devices that allow us to quickly and\ninexpensively produce large datasets from non-trivial but well-understood\nphysical systems. The devices, which we call causal chambers, are\ncomputer-controlled laboratories that allow us to manipulate and measure an\narray of variables from these physical systems, providing a rich testbed for\nalgorithms from a variety of fields. We illustrate potential applications\nthrough a series of case studies in fields such as causal discovery,\nout-of-distribution generalization, change point detection, independent\ncomponent analysis, and symbolic regression. For applications to causal\ninference, the chambers allow us to carefully perform interventions. We also\nprovide and empirically validate a causal model of each chamber, which can be\nused as ground truth for different tasks. All hardware and software is made\nopen source, and the datasets are publicly available at causalchamber.org or\nthrough the Python package causalchamber.\n","authors":["Juan L. Gamella","Jonas Peters","Peter Bühlmann"],"pdf_url":"https://arxiv.org/pdf/2404.11341v2.pdf","comment":"40 pages, 20 figures"},{"id":"http://arxiv.org/abs/2408.14206v1","updated":"2024-08-26T12:09:38Z","published":"2024-08-26T12:09:38Z","title":"Lemon and Orange Disease Classification using CNN-Extracted Features and\n Machine Learning Classifier","summary":" Lemons and oranges, both are the most economically significant citrus fruits\nglobally. The production of lemons and oranges is severely affected due to\ndiseases in its growth stages. Fruit quality has degraded due to the presence\nof flaws. Thus, it is necessary to diagnose the disease accurately so that we\ncan avoid major loss of lemons and oranges. To improve citrus farming, we\nproposed a disease classification approach for lemons and oranges. This\napproach would enable early disease detection and intervention, reduce yield\nlosses, and optimize resource allocation. For the initial modeling of disease\nclassification, the research uses innovative deep learning architectures such\nas VGG16, VGG19 and ResNet50. 
In addition, for achieving better accuracy, the\nbasic machine learning algorithms used for classification problems include\nRandom Forest, Naive Bayes, K-Nearest Neighbors (KNN) and Logistic Regression.\nThe lemon and orange fruits diseases are classified more accurately (95.0% for\nlemon and 99.69% for orange) by the model. The model's base features were\nextracted from the ResNet50 pre-trained model and the diseases are classified\nby the Logistic Regression which beats the performance given by VGG16 and VGG19\nfor other classifiers. Experimental outcomes show that the proposed model also\noutperforms existing models in which most of them classified the diseases using\nthe Softmax classifier without using any individual classifiers.\n","authors":["Khandoker Nosiba Arifin","Sayma Akter Rupa","Md Musfique Anwar","Israt Jahan"],"pdf_url":"https://arxiv.org/pdf/2408.14206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03846v2","updated":"2024-08-26T11:58:22Z","published":"2024-02-06T09:48:33Z","title":"Efficient Generation of Hidden Outliers for Improved Outlier Detection","summary":" Outlier generation is a popular technique used for solving important outlier\ndetection tasks. Generating outliers with realistic behavior is challenging.\nPopular existing methods tend to disregard the 'multiple views' property of\noutliers in high-dimensional spaces. The only existing method accounting for\nthis property falls short in efficiency and effectiveness. We propose BISECT, a\nnew outlier generation method that creates realistic outliers mimicking said\nproperty. To do so, BISECT employs a novel proposition introduced in this\narticle stating how to efficiently generate said realistic outliers. Our method\nhas better guarantees and complexity than the current methodology for\nrecreating 'multiple views'. We use the synthetic outliers generated by BISECT\nto effectively enhance outlier detection in diverse datasets, for multiple use\ncases. For instance, oversampling with BISECT reduced the error by up to 3\ntimes when compared with the baselines.\n","authors":["Jose Cribeiro-Ramallo","Vadim Arzamasov","Klemens Böhm"],"pdf_url":"https://arxiv.org/pdf/2402.03846v2.pdf","comment":"Preprint. Full paper is scheduled to appear in TKDD; Updated results\n in table 4"},{"id":"http://arxiv.org/abs/2408.14195v1","updated":"2024-08-26T11:47:52Z","published":"2024-08-26T11:47:52Z","title":"Representative Arm Identification: A fixed confidence approach to\n identify cluster representatives","summary":" We study the representative arm identification (RAI) problem in the\nmulti-armed bandits (MAB) framework, wherein we have a collection of arms, each\nassociated with an unknown reward distribution. An underlying instance is\ndefined by a partitioning of the arms into clusters of predefined sizes, such\nthat for any $j > i$, all arms in cluster $i$ have a larger mean reward than\nthose in cluster $j$. The goal in RAI is to reliably identify a certain\nprespecified number of arms from each cluster, while using as few arm pulls as\npossible. The RAI problem covers as special cases several well-studied MAB\nproblems such as identifying the best arm or any $M$ out of the top $K$, as\nwell as both full and coarse ranking. We start by providing an\ninstance-dependent lower bound on the sample complexity of any feasible\nalgorithm for this setting. 
We then propose two algorithms, based on the idea\nof confidence intervals, and provide high probability upper bounds on their\nsample complexity, which orderwise match the lower bound. Finally, we do an\nempirical comparison of both algorithms along with an LUCB-type alternative on\nboth synthetic and real-world datasets, and demonstrate the superior\nperformance of our proposed schemes in most cases.\n","authors":["Sarvesh Gharat","Aniket Yadav","Nikhil Karamchandani","Jayakrishnan Nair"],"pdf_url":"https://arxiv.org/pdf/2408.14195v1.pdf","comment":"We analyse a clustered multi-armed bandit formulation, where the\n learning objective is to identify representative arms from each cluster, in a\n fixed confidence setting"},{"id":"http://arxiv.org/abs/2408.05920v3","updated":"2024-08-26T11:41:28Z","published":"2024-08-12T05:00:23Z","title":"Urban Region Pre-training and Prompting: A Graph-based Approach","summary":" Urban region representation is crucial for various urban downstream tasks.\nHowever, despite the proliferation of methods and their success, acquiring\ngeneral urban region knowledge and adapting to different tasks remains\nchallenging. Previous work often neglects the spatial structures and functional\nlayouts between entities, limiting their ability to capture transferable\nknowledge across regions. Further, these methods struggle to adapt effectively\nto specific downstream tasks, as they do not adequately address the unique\nfeatures and relationships required for different downstream tasks. In this\npaper, we propose a $\\textbf{G}$raph-based $\\textbf{U}$rban $\\textbf{R}$egion\n$\\textbf{P}$re-training and $\\textbf{P}$rompting framework ($\\textbf{GURPP}$)\nfor region representation learning. Specifically, we first construct an urban\nregion graph that integrates detailed spatial entity data for more effective\nurban region representation. Then, we develop a subgraph-centric urban region\npre-training model to capture the heterogeneous and transferable patterns of\ninteractions among entities. To further enhance the adaptability of these\nembeddings to different tasks, we design two graph-based prompting methods to\nincorporate explicit/hidden task knowledge. Extensive experiments on various\nurban region prediction tasks and different cities demonstrate the superior\nperformance of our GURPP framework.\n","authors":["Jiahui Jin","Yifan Song","Dong Kan","Haojia Zhu","Xiangguo Sun","Zhicheng Li","Xigang Sun","Jinghui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05920v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02595v3","updated":"2024-08-26T11:35:52Z","published":"2023-04-02T02:19:15Z","title":"Bayesian neural networks via MCMC: a Python-based tutorial","summary":" Bayesian inference provides a methodology for parameter estimation and\nuncertainty quantification in machine learning and deep learning methods.\nVariational inference and Markov Chain Monte-Carlo (MCMC) sampling methods are\nused to implement Bayesian inference. In the past three decades, MCMC sampling\nmethods have faced some challenges in being adapted to larger models (such as\nin deep learning) and big data problems. Advanced proposal distributions that\nincorporate gradients, such as a Langevin proposal distribution, provide a\nmeans to address some of the limitations of MCMC sampling for Bayesian neural\nnetworks. Furthermore, MCMC methods have typically been constrained to\nstatisticians and currently not well-known among deep learning researchers. 
We\npresent a tutorial for MCMC methods that covers simple Bayesian linear and\nlogistic models, and Bayesian neural networks. The aim of this tutorial is to\nbridge the gap between theory and implementation via coding, given a general\nsparsity of libraries and tutorials to this end. This tutorial provides code in\nPython with data and instructions that enable their use and extension. We\nprovide results for some benchmark problems showing the strengths and\nweaknesses of implementing the respective Bayesian models via MCMC. We\nhighlight the challenges in sampling multi-modal posterior distributions for\nthe case of Bayesian neural networks and the need for further improvement of\nconvergence diagnosis methods.\n","authors":["Rohitash Chandra","Joshua Simmons"],"pdf_url":"https://arxiv.org/pdf/2304.02595v3.pdf","comment":"IEEE Access (2024)"},{"id":"http://arxiv.org/abs/2406.12284v2","updated":"2024-08-26T11:33:13Z","published":"2024-06-18T05:23:29Z","title":"Demystifying the Recency Heuristic in Temporal-Difference Learning","summary":" The recency heuristic in reinforcement learning is the assumption that\nstimuli that occurred closer in time to an acquired reward should be more\nheavily reinforced. The recency heuristic is one of the key assumptions made by\nTD($\\lambda$), which reinforces recent experiences according to an\nexponentially decaying weighting. In fact, all other widely used return\nestimators for TD learning, such as $n$-step returns, satisfy a weaker (i.e.,\nnon-monotonic) recency heuristic. Why is the recency heuristic effective for\ntemporal credit assignment? What happens when credit is assigned in a way that\nviolates this heuristic? In this paper, we analyze the specific mathematical\nimplications of adopting the recency heuristic in TD learning. We prove that\nany return estimator satisfying this heuristic: 1) is guaranteed to converge to\nthe correct value function, 2) has a relatively fast contraction rate, and 3)\nhas a long window of effective credit assignment, yet bounded worst-case\nvariance. We also give a counterexample where on-policy, tabular TD methods\nviolating the recency heuristic diverge. Our results offer some of the first\ntheoretical evidence that credit assignment based on the recency heuristic\nfacilitates learning.\n","authors":["Brett Daley","Marlos C. Machado","Martha White"],"pdf_url":"https://arxiv.org/pdf/2406.12284v2.pdf","comment":"RLC 2024. 18 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.14183v1","updated":"2024-08-26T11:16:03Z","published":"2024-08-26T11:16:03Z","title":"Robot Navigation with Entity-Based Collision Avoidance using Deep\n Reinforcement Learning","summary":" Efficient navigation in dynamic environments is crucial for autonomous robots\ninteracting with various environmental entities, including both moving agents\nand static obstacles. In this study, we present a novel methodology that\nenhances the robot's interaction with different types of agents and obstacles\nbased on specific safety requirements. This approach uses information about the\nentity types, improving collision avoidance and ensuring safer navigation. We\nintroduce a new reward function that penalizes the robot for collisions with\ndifferent entities such as adults, bicyclists, children, and static obstacles,\nand additionally encourages the robot's proximity to the goal. It also\npenalizes the robot for being close to entities, and the safe distance also\ndepends on the entity type. 
Additionally, we propose an optimized algorithm for\ntraining and testing, which significantly accelerates train, validation, and\ntest steps and enables training in complex environments. Comprehensive\nexperiments conducted using simulation demonstrate that our approach\nconsistently outperforms conventional navigation and collision avoidance\nmethods, including state-of-the-art techniques. To sum up, this work\ncontributes to enhancing the safety and efficiency of navigation systems for\nautonomous robots in dynamic, crowded environments.\n","authors":["Yury Kolomeytsev","Dmitry Golembiovsky"],"pdf_url":"https://arxiv.org/pdf/2408.14183v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.11237v2","updated":"2024-08-26T10:39:22Z","published":"2024-02-17T10:02:22Z","title":"Be Persistent: Towards a Unified Solution for Mitigating Shortcuts in\n Deep Learning","summary":" Deep neural networks (DNNs) are vulnerable to shortcut learning: rather than\nlearning the intended task, they tend to draw inconclusive relationships\nbetween their inputs and outputs. Shortcut learning is ubiquitous among many\nfailure cases of neural networks, and traces of this phenomenon can be seen in\ntheir generalizability issues, domain shift, adversarial vulnerability, and\neven bias towards majority groups. In this paper, we argue that this\ncommonality in the cause of various DNN issues creates a significant\nopportunity that should be leveraged to find a unified solution for shortcut\nlearning. To this end, we outline the recent advances in topological data\nanalysis (TDA), and persistent homology (PH) in particular, to sketch a unified\nroadmap for detecting shortcuts in deep learning. We demonstrate our arguments\nby investigating the topological features of computational graphs in DNNs using\ntwo cases of unlearnable examples and bias in decision-making as our test\nstudies. Our analysis of these two failure cases of DNNs reveals that finding a\nunified solution for shortcut learning in DNNs is not out of reach, and TDA can\nplay a significant role in forming such a framework.\n","authors":["Hadi M. Dolatabadi","Sarah M. Erfani","Christopher Leckie"],"pdf_url":"https://arxiv.org/pdf/2402.11237v2.pdf","comment":"Accepted to the 2024 European Conference on Artificial Intelligence\n (ECAI)"},{"id":"http://arxiv.org/abs/2306.08270v2","updated":"2024-08-26T10:26:28Z","published":"2023-06-14T06:13:50Z","title":"Solar Active Regions Detection Via 2D Circular Kernel Time Series\n Transformation, Entropy and Machine Learning Approach","summary":" This study proposes an enhancement to the existing method for detecting Solar\nActive Regions (ARs). Our technique tracks ARs using images from the\nAtmospheric Imaging Assembly (AIA) of NASA's Solar Dynamics Observatory (SDO).\nIt involves a 2D circular kernel time series transformation, combined with\nStatistical and Entropy measures, and a Machine Learning (ML) approach. The\ntechnique transforms the circular area around pixels in the SDO AIA images into\none-dimensional time series (1-DTS). Statistical measures (Median Value, Xmed;\n95th Percentile, X95) and Entropy measures (Distribution Entropy, DisEn; Fuzzy\nEntropy, FuzzyEn) are used as feature selection methods (FSM 1), alongside a\nmethod applying 1-DTS elements directly as features (FSM 2). 
The ML algorithm\nclassifies these series into three categories: no Active Region (nARs type 1,\nclass 1), non-flaring Regions outside active regions with brightness (nARs type\n2, class 2), and flaring Active Regions (ARs, class 3). The ML model achieves a\nclassification accuracy of 0.900 and 0.914 for Entropy and Statistical\nmeasures, respectively. Notably, Fuzzy Entropy shows the highest classification\naccuracy (AKF=0.895), surpassing DisEn (AKF=0.738), X95 (AKF=0.873), and Xmed\n(AKF=0.840). This indicates the high effectiveness of Entropy and Statistical\nmeasures for AR detection in SDO AIA images. FSM 2 captures a similar\ndistribution of flaring AR activities as FSM 1. Additionally, we introduce a\ngeneralizing characteristic of AR activities (GSA), finding a direct agreement\nbetween increased AR activities and higher GSA values. The Python code\nimplementation of the proposed method is available in supplementary material.\n","authors":["Irewola Aaron Oludehinwa","Andrei Velichko","Maksim Belyaev","Olasunkanmi I. Olusola"],"pdf_url":"https://arxiv.org/pdf/2306.08270v2.pdf","comment":"30 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.03309v2","updated":"2024-08-26T10:21:00Z","published":"2024-04-04T09:08:04Z","title":"Optimistic Online Non-stochastic Control via FTRL","summary":" This paper brings the concept of ``optimism\" to the new and promising\nframework of online Non-stochastic Control (NSC). Namely, we study how NSC can\nbenefit from a prediction oracle of unknown quality responsible for forecasting\nfuture costs. The posed problem is first reduced to an optimistic learning with\ndelayed feedback problem, which is handled through the Optimistic Follow the\nRegularized Leader (OFTRL) algorithmic family. This reduction enables the\ndesign of \\texttt{OptFTRL-C}, the first Disturbance Action Controller (DAC)\nwith optimistic policy regret bounds. These new bounds are commensurate with\nthe oracle's accuracy, ranging from $\\mathcal{O}(1)$ for perfect predictions to\nthe order-optimal $\\mathcal{O}(\\sqrt{T})$ even when all predictions fail. By\naddressing the challenge of incorporating untrusted predictions into online\ncontrol, this work contributes to the advancement of the NSC framework and\npaves the way toward effective and robust learning-based controllers.\n","authors":["Naram Mhaisen","George Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2404.03309v2.pdf","comment":"to appear in the proceedings of IEEE CDC 2024"},{"id":"http://arxiv.org/abs/2312.01878v8","updated":"2024-08-26T10:13:43Z","published":"2023-12-04T13:20:15Z","title":"HGPROMPT: Bridging Homogeneous and Heterogeneous Graphs for Few-shot\n Prompt Learning","summary":" Graph neural networks (GNNs) and heterogeneous graph neural networks (HGNNs)\nare prominent techniques for homogeneous and heterogeneous graph representation\nlearning, yet their performance in an end-to-end supervised framework greatly\ndepends on the availability of task-specific supervision. To reduce the\nlabeling cost, pre-training on self-supervised pretext tasks has become a\npopular paradigm,but there is often a gap between the pre-trained model and\ndownstream tasks, stemming from the divergence in their objectives. 
To bridge\nthe gap, prompt learning has risen as a promising direction especially in\nfew-shot settings, without the need to fully fine-tune the pre-trained model.\nWhile there has been some early exploration of prompt-based learning on graphs,\nthey primarily deal with homogeneous graphs, ignoring the heterogeneous graphs\nthat are prevalent in downstream applications. In this paper, we propose\nHGPROMPT, a novel pre-training and prompting framework to unify not only\npre-training and downstream tasks but also homogeneous and heterogeneous graphs\nvia a dual-template design. Moreover, we propose dual-prompt in HGPROMPT to\nassist a downstream task in locating the most relevant prior to bridge the gaps\ncaused by not only feature variations but also heterogeneity differences across\ntasks. Finally, we thoroughly evaluate and analyze HGPROMPT through extensive\nexperiments on three public datasets.\n","authors":["Xingtong Yu","Yuan Fang","Zemin Liu","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.01878v8.pdf","comment":"AAAI2024 main track"},{"id":"http://arxiv.org/abs/2311.15317v5","updated":"2024-08-26T10:12:45Z","published":"2023-11-26T14:35:28Z","title":"Generalized Graph Prompt: Toward a Unification of Pre-Training and\n Downstream Tasks on Graphs","summary":" Graph neural networks have emerged as a powerful tool for graph\nrepresentation learning, but their performance heavily relies on abundant\ntask-specific supervision. To reduce labeling requirement, the \"pre-train,\nprompt\" paradigms have become increasingly common. However, existing study of\nprompting on graphs is limited, lacking a universal treatment to appeal to\ndifferent downstream tasks. In this paper, we propose GraphPrompt, a novel\npre-training and prompting framework on graphs. GraphPrompt not only unifies\npre-training and downstream tasks into a common task template but also employs\na learnable prompt to assist a downstream task in locating the most relevant\nknowledge from the pre-trained model in a task-specific manner. To further\nenhance GraphPrompt in these two stages, we extend it into GraphPrompt+ with\ntwo major enhancements. First, we generalize several popular graph pre-training\ntasks beyond simple link prediction to broaden the compatibility with our task\ntemplate. Second, we propose a more generalized prompt design that incorporates\na series of prompt vectors within every layer of the pre-trained graph encoder,\nin order to capitalize on the hierarchical information across different layers\nbeyond just the readout layer. Finally, we conduct extensive experiments on\nfive public datasets to evaluate and analyze GraphPrompt and GraphPrompt+.\n","authors":["Xingtong Yu","Zhenghao Liu","Yuan Fang","Zemin Liu","Sihong Chen","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15317v5.pdf","comment":"Accepted by IEEE TKDE. Extension of \"GraphPrompt: Unifying\n Pre-Training and Downstream Tasks for Graph Neural Networks\". arXiv admin\n note: substantial text overlap with arXiv:2302.08043"},{"id":"http://arxiv.org/abs/2312.03731v7","updated":"2024-08-26T10:11:45Z","published":"2023-11-28T02:36:53Z","title":"MultiGPrompt for Multi-Task Pre-Training and Prompting on Graphs","summary":" Graphs can inherently model interconnected objects on the Web, thereby\nfacilitating a series of Web applications, such as web analyzing and content\nrecommendation. Recently, Graph Neural Networks (GNNs) have emerged as a\nmainstream technique for graph representation learning. 
However, their efficacy\nwithin an end-to-end supervised framework is significantly tied to the\navailability of task-specific labels. To mitigate labeling costs and enhance\nrobustness in few-shot settings, pre-training on self-supervised tasks has\nemerged as a promising method, while prompting has been proposed to further\nnarrow the objective gap between pretext and downstream tasks. Although there\nhas been some initial exploration of prompt-based learning on graphs, these efforts\nprimarily leverage a single pretext task, resulting in a limited subset of\ngeneral knowledge that could be learned from the pre-training data. Hence, in\nthis paper, we propose MultiGPrompt, a novel multi-task pre-training and\nprompting framework to exploit multiple pretext tasks for more comprehensive\npre-trained knowledge. First, in pre-training, we design a set of pretext\ntokens to synergize multiple pretext tasks. Second, we propose a dual-prompt\nmechanism consisting of composed and open prompts to leverage task-specific and\nglobal pre-training knowledge, to guide downstream tasks in few-shot settings.\nFinally, we conduct extensive experiments on six public datasets to evaluate\nand analyze MultiGPrompt.\n","authors":["Xingtong Yu","Chang Zhou","Yuan Fang","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03731v7.pdf","comment":"WWW2024 research track"},{"id":"http://arxiv.org/abs/2402.03903v3","updated":"2024-08-26T09:59:24Z","published":"2024-02-06T11:13:57Z","title":"Averaging $n$-step Returns Reduces Variance in Reinforcement Learning","summary":" Multistep returns, such as $n$-step returns and $\\lambda$-returns, are\ncommonly used to improve the sample efficiency of reinforcement learning (RL)\nmethods. The variance of the multistep returns becomes the limiting factor in\ntheir length; looking too far into the future increases variance and reverses\nthe benefits of multistep learning. In our work, we demonstrate the ability of\ncompound returns -- weighted averages of $n$-step returns -- to reduce\nvariance. We prove for the first time that any compound return with the same\ncontraction modulus as a given $n$-step return has strictly lower variance. We\nadditionally prove that this variance-reduction property improves the\nfinite-sample complexity of temporal-difference learning under linear function\napproximation. Because general compound returns can be expensive to implement,\nwe introduce two-bootstrap returns which reduce variance while remaining\nefficient, even when using minibatched experience replay. We conduct\nexperiments showing that compound returns often increase the sample efficiency\nof $n$-step deep RL agents like DQN and PPO.\n","authors":["Brett Daley","Martha White","Marlos C. Machado"],"pdf_url":"https://arxiv.org/pdf/2402.03903v3.pdf","comment":"ICML 2024. 27 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.13431v2","updated":"2024-08-26T09:58:04Z","published":"2024-07-18T12:00:32Z","title":"Improving Out-of-Distribution Generalization of Trajectory Prediction\n for Autonomous Driving via Polynomial Representations","summary":" Robustness against Out-of-Distribution (OoD) samples is a key performance\nindicator of a trajectory prediction model. However, the development and\nranking of state-of-the-art (SotA) models are driven by their In-Distribution\n(ID) performance on individual competition datasets. We present an OoD testing\nprotocol that homogenizes datasets and prediction tasks across two large-scale\nmotion datasets. 
We introduce a novel prediction algorithm based on polynomial\nrepresentations for agent trajectory and road geometry on both the input and\noutput sides of the model. With a much smaller model size, training effort, and\ninference time, we reach near SotA performance for ID testing and significantly\nimprove robustness in OoD testing. Within our OoD testing protocol, we further\nstudy two augmentation strategies of SotA models and their effects on model\ngeneralization. Highlighting the contrast between ID and OoD performance, we\nsuggest adding OoD testing to the evaluation criteria of trajectory prediction\nmodels.\n","authors":["Yue Yao","Shengchao Yan","Daniel Goehring","Wolfram Burgard","Joerg Reichardt"],"pdf_url":"https://arxiv.org/pdf/2407.13431v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14152v1","updated":"2024-08-26T09:55:32Z","published":"2024-08-26T09:55:32Z","title":"Application of Disentanglement to Map Registration Problem","summary":" Geospatial data come from various sources, such as satellites, aircraft, and\nLiDAR. The variability of the source is not limited to the types of data\nacquisition techniques, as we have maps from different time periods. To\nincorporate these data for a coherent analysis, it is essential to first align\ndifferent \"styles\" of geospatial data to its matching images that point to the\nsame location on the surface of the Earth. In this paper, we approach the image\nregistration as a two-step process of (1) extracting geospatial contents\ninvariant to visual (and any other non-content-related) information, and (2)\nmatching the data based on such (purely) geospatial contents. We hypothesize\nthat a combination of $\\beta$-VAE-like architecture [2] and adversarial\ntraining will achieve both the disentanglement of the geographic information\nand artistic styles and generation of new map tiles by composing the encoded\ngeographic information with any artistic style.\n","authors":["Hae Jin Song","Patrycja Krawczuk","Po-Hsuan Huang"],"pdf_url":"https://arxiv.org/pdf/2408.14152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14146v1","updated":"2024-08-26T09:44:21Z","published":"2024-08-26T09:44:21Z","title":"TSAK: Two-Stage Semantic-Aware Knowledge Distillation for Efficient\n Wearable Modality and Model Optimization in Manufacturing Lines","summary":" Smaller machine learning models, with less complex architectures and sensor\ninputs, can benefit wearable sensor-based human activity recognition (HAR)\nsystems in many ways, from complexity and cost to battery life. In the specific\ncase of smart factories, optimizing human-robot collaboration hinges on the\nimplementation of cutting-edge, human-centric AI systems. To this end, workers'\nactivity recognition enables accurate quantification of performance metrics,\nimproving efficiency holistically. We present a two-stage semantic-aware\nknowledge distillation (KD) approach, TSAK, for efficient, privacy-aware, and\nwearable HAR in manufacturing lines, which reduces the input sensor modalities\nas well as the machine learning model size, while reaching similar recognition\nperformance as a larger multi-modal and multi-positional teacher model. The\nfirst stage incorporates a teacher classifier model encoding attention, causal,\nand combined representations. The second stage encompasses a semantic\nclassifier merging the three representations from the first stage. 
To evaluate\nTSAK, we recorded a multi-modal dataset at a smart factory testbed with\nwearable and privacy-aware sensors (IMU and capacitive) located on both\nworkers' hands. In addition, we evaluated our approach on OpenPack, the only\navailable open dataset mimicking the wearable sensor placements on both hands\nin the manufacturing HAR scenario. We compared several KD strategies with\ndifferent representations to regulate the training process of a smaller student\nmodel. Compared to the larger teacher model, the student model takes fewer\nsensor channels from a single hand, has 79% fewer parameters, runs 8.88 times\nfaster, and requires 96.6% less computing power (FLOPS).\n","authors":["Hymalai Bello","Daniel Geißler","Sungho Suh","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2408.14146v1.pdf","comment":"Accepted in 27th International Conference on Pattern Recognition\n (ICPR)"},{"id":"http://arxiv.org/abs/2407.02112v2","updated":"2024-08-26T09:43:12Z","published":"2024-07-02T09:54:39Z","title":"A Data-Centric Perspective on Evaluating Machine Learning Models for\n Tabular Data","summary":" Tabular data is prevalent in real-world machine learning applications, and\nnew models for supervised learning of tabular data are frequently proposed.\nComparative studies assessing the performance of models typically consist of\nmodel-centric evaluation setups with overly standardized data preprocessing.\nThis paper demonstrates that such model-centric evaluations are biased, as\nreal-world modeling pipelines often require dataset-specific preprocessing and\nfeature engineering. Therefore, we propose a data-centric evaluation framework.\nWe select 10 relevant datasets from Kaggle competitions and implement\nexpert-level preprocessing pipelines for each dataset. We conduct experiments\nwith different preprocessing pipelines and hyperparameter optimization (HPO)\nregimes to quantify the impact of model selection, HPO, feature engineering,\nand test-time adaptation. Our main findings are: 1. After dataset-specific\nfeature engineering, model rankings change considerably, performance\ndifferences decrease, and the importance of model selection reduces. 2. Recent\nmodels, despite their measurable progress, still significantly benefit from\nmanual feature engineering. This holds true for both tree-based models and\nneural networks. 3. While tabular data is typically considered static, samples\nare often collected over time, and adapting to distribution shifts can be\nimportant even in supposedly static data. These insights suggest that research\nefforts should be directed toward a data-centric perspective, acknowledging\nthat tabular data requires feature engineering and often exhibits temporal\ncharacteristics. Our framework is available under:\nhttps://github.com/atschalz/dc_tabeval.\n","authors":["Andrej Tschalzev","Sascha Marton","Stefan Lüdtke","Christian Bartelt","Heiner Stuckenschmidt"],"pdf_url":"https://arxiv.org/pdf/2407.02112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14144v1","updated":"2024-08-26T09:42:18Z","published":"2024-08-26T09:42:18Z","title":"Neighborhood and Global Perturbations Supported SAM in Federated\n Learning: From Local Tweaks To Global Awareness","summary":" Federated Learning (FL) can be coordinated under the orchestration of a\ncentral server to collaboratively build a privacy-preserving model without the\nneed for data exchange. 
However, participant data heterogeneity leads to local\noptima divergence, subsequently affecting convergence outcomes. Recent research\nhas focused on global sharpness-aware minimization (SAM) and dynamic\nregularization techniques to enhance consistency between global and local\ngeneralization and optimization objectives. Nonetheless, the estimation of\nglobal SAM introduces additional computational and memory overhead, while\ndynamic regularization suffers from bias in the local and global dual variables\ndue to training isolation. In this paper, we propose a novel FL algorithm,\nFedTOGA, designed to consider optimization and generalization objectives while\nmaintaining minimal uplink communication overhead. By linking local\nperturbations to global updates, global generalization consistency is improved.\nAdditionally, global updates are used to correct local dynamic regularizers,\nreducing dual-variable bias and enhancing optimization consistency. Global\nupdates are passively received by clients, reducing overhead. We also propose\nneighborhood perturbation to approximate local perturbation, analyzing its\nstrengths and limitations. Theoretical analysis shows FedTOGA achieves faster\nconvergence $O(1/T)$ under non-convex functions. Empirical studies demonstrate\nthat FedTOGA outperforms state-of-the-art algorithms, with a 1\% accuracy\nincrease and 30\% faster convergence.\n","authors":["Boyuan Li","Zihao Peng","Yafei Li","Mingliang Xu","Shengbo Chen","Baofeng Ji","Cong Shen"],"pdf_url":"https://arxiv.org/pdf/2408.14144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14143v1","updated":"2024-08-26T09:41:40Z","published":"2024-08-26T09:41:40Z","title":"2D-Malafide: Adversarial Attacks Against Face Deepfake Detection Systems","summary":" We introduce 2D-Malafide, a novel and lightweight adversarial attack designed\nto deceive face deepfake detection systems. Building upon the concept of 1D\nconvolutional perturbations explored in the speech domain, our method leverages\n2D convolutional filters to craft perturbations which significantly degrade the\nperformance of state-of-the-art face deepfake detectors. Unlike traditional\nadditive noise approaches, 2D-Malafide optimises a small number of filter\ncoefficients to generate robust adversarial perturbations which are\ntransferable across different face images. Experiments, conducted using the\nFaceForensics++ dataset, demonstrate that 2D-Malafide substantially degrades\ndetection performance in both white-box and black-box settings, with larger\nfilter sizes having the greatest impact. Additionally, we report an\nexplainability analysis using GradCAM which illustrates how 2D-Malafide\nmisleads detection systems by altering the image areas used most for\nclassification. 
Our findings highlight the vulnerability of current deepfake\ndetection systems to convolutional adversarial attacks as well as the need for\nfuture work to enhance detection robustness through improved image fidelity\nconstraints.\n","authors":["Chiara Galdi","Michele Panariello","Massimiliano Todisco","Nicholas Evans"],"pdf_url":"https://arxiv.org/pdf/2408.14143v1.pdf","comment":"Accepted at BIOSIG 2024"},{"id":"http://arxiv.org/abs/2405.18194v3","updated":"2024-08-26T09:35:54Z","published":"2024-05-28T14:04:09Z","title":"Delving into Differentially Private Transformer","summary":" Deep learning with differential privacy (DP) has garnered significant\nattention over the past years, leading to the development of numerous methods\naimed at enhancing model accuracy and training efficiency. This paper delves\ninto the problem of training Transformer models with differential privacy. Our\ntreatment is modular: the logic is to `reduce' the problem of training DP\nTransformer to the more basic problem of training DP vanilla neural nets. The\nlatter is better understood and amenable to many model-agnostic methods. Such\n`reduction' is done by first identifying the hardness unique to DP Transformer\ntraining: the attention distraction phenomenon and a lack of compatibility with\nexisting techniques for efficient gradient clipping. To deal with these two\nissues, we propose the Re-Attention Mechanism and Phantom Clipping,\nrespectively. We believe that our work not only casts new light on training DP\nTransformers but also promotes a modular treatment to advance research in the\nfield of differentially private deep learning.\n","authors":["Youlong Ding","Xueyang Wu","Yining Meng","Yonggang Luo","Hao Wang","Weike Pan"],"pdf_url":"https://arxiv.org/pdf/2405.18194v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.14134v1","updated":"2024-08-26T09:29:56Z","published":"2024-08-26T09:29:56Z","title":"Exploring the Potential of Large Language Models for Heterophilic Graphs","summary":" Graph Neural Networks (GNNs) are essential for various graph-based learning\ntasks. Notably, classical GNN architectures operate under the assumption of\nhomophily, which posits that connected nodes are likely to share similar\nfeatures. However, this assumption limits the effectiveness of GNNs in handling\nheterophilic graphs where connected nodes often exhibit dissimilar\ncharacteristics. Existing approaches for homophily graphs such as non-local\nneighbor extension and architectural refinement overlook the rich textual data\nassociated with nodes, which could unlock deeper insights into these\nheterophilic contexts. With advancements in Large Language Models (LLMs), there\nis significant promise to enhance GNNs by leveraging the extensive open-world\nknowledge within LLMs to more effectively interpret and utilize textual data\nfor characterizing heterophilic graphs. In this work, we explore the potential\nof LLMs for modeling heterophilic graphs and propose a novel two-stage\nframework: LLM-enhanced edge discriminator and LLM-guided edge reweighting.\nSpecifically, in the first stage, we fine-tune the LLM to better identify\nhomophilic and heterophilic edges based on the textual information of their\nnodes. In the second stage, we adaptively manage message propagation in GNNs\nfor different edge types based on node features, structures, and heterophilic\nor homophilic characteristics. 
To cope with the computational demands when\ndeploying LLMs in practical scenarios, we further explore model distillation\ntechniques to fine-tune smaller, more efficient models that maintain\ncompetitive performance. Extensive experiments validate the effectiveness of\nour framework, demonstrating the feasibility of using LLMs to enhance GNNs for\nnode classification on heterophilic graphs.\n","authors":["Yuxia Wu","Shujie Li","Yuan Fang","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2408.14134v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.14130v1","updated":"2024-08-26T09:24:36Z","published":"2024-08-26T09:24:36Z","title":"Theoretical Proportion Label Perturbation for Learning from Label\n Proportions in Large Bags","summary":" Learning from label proportions (LLP) is a kind of weakly supervised learning\nthat trains an instance-level classifier from label proportions of bags, which\nconsist of sets of instances without using instance labels. A challenge in LLP\narises when the number of instances in a bag (bag size) is numerous, making the\ntraditional LLP methods difficult due to GPU memory limitations. This study\naims to develop an LLP method capable of learning from bags with large sizes.\nIn our method, smaller bags (mini-bags) are generated by sampling instances\nfrom large-sized bags (original bags), and these mini-bags are used in place of\nthe original bags. However, the proportion of a mini-bag is unknown and differs\nfrom that of the original bag, leading to overfitting. To address this issue,\nwe propose a perturbation method for the proportion labels of sampled mini-bags\nto mitigate overfitting to noisy label proportions. This perturbation is added\nbased on the multivariate hypergeometric distribution, which is statistically\nmodeled. Additionally, loss weighting is implemented to reduce the negative\nimpact of proportions sampled from the tail of the distribution. Experimental\nresults demonstrate that the proportion label perturbation and loss weighting\nachieve classification accuracy comparable to that obtained without sampling.\nOur codes are available at https://github.com/stainlessnight/LLP-LargeBags.\n","authors":["Shunsuke Kubo","Shinnosuke Matsuo","Daiki Suehiro","Kazuhiro Terada","Hiroaki Ito","Akihiko Yoshizawa","Ryoma Bise"],"pdf_url":"https://arxiv.org/pdf/2408.14130v1.pdf","comment":"Accepted at ECAI2024"},{"id":"http://arxiv.org/abs/2408.14126v1","updated":"2024-08-26T09:19:58Z","published":"2024-08-26T09:19:58Z","title":"Enhancing Fairness through Reweighting: A Path to Attain the Sufficiency\n Rule","summary":" We introduce an innovative approach to enhancing the empirical risk\nminimization (ERM) process in model training through a refined reweighting\nscheme of the training data to enhance fairness. This scheme aims to uphold the\nsufficiency rule in fairness by ensuring that optimal predictors maintain\nconsistency across diverse sub-groups. We employ a bilevel formulation to\naddress this challenge, wherein we explore sample reweighting strategies.\nUnlike conventional methods that hinge on model size, our formulation bases\ngeneralization complexity on the space of sample weights. We discretize the\nweights to improve training speed. 
Empirical validation of our method showcases\nits effectiveness and robustness, revealing a consistent improvement in the\nbalance between prediction performance and fairness metrics across various\nexperiments.\n","authors":["Xuan Zhao","Klaus Broelemann","Salvatore Ruggieri","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2408.14126v1.pdf","comment":"accepted at ECAI 2024"},{"id":"http://arxiv.org/abs/2407.05206v4","updated":"2024-08-26T09:15:11Z","published":"2024-07-06T23:16:41Z","title":"Helios: An extremely low power event-based gesture recognition for\n always-on smart eyewear","summary":" This paper introduces Helios, the first extremely low-power, real-time,\nevent-based hand gesture recognition system designed for all-day use on smart\neyewear. As augmented reality (AR) evolves, current smart glasses like the Meta\nRay-Bans prioritize visual and wearable comfort at the expense of\nfunctionality. Existing human-machine interfaces (HMIs) in these devices, such\nas capacitive touch and voice controls, present limitations in ergonomics,\nprivacy and power consumption. Helios addresses these challenges by leveraging\nnatural hand interactions for a more intuitive and comfortable user experience.\nOur system utilizes an extremely low-power and compact 3mmx4mm/20mW event camera\nto perform natural hand-based gesture recognition for always-on smart eyewear.\nThe camera's output is processed by a convolutional neural network (CNN)\nrunning on an NXP Nano UltraLite compute platform, consuming less than 350mW.\nHelios can recognize seven classes of gestures, including subtle microgestures\nlike swipes and pinches, with 91% accuracy. We also demonstrate real-time\nperformance across 20 users at a remarkably low latency of 60ms. Our user\ntesting results align with the positive feedback we received during our recent\nsuccessful demo at AWE-USA-2024.\n","authors":["Prarthana Bhattacharyya","Joshua Mitton","Ryan Page","Owen Morgan","Ben Menzies","Gabriel Homewood","Kemi Jacobs","Paolo Baesso","David Trickett","Chris Mair","Taru Muhonen","Rory Clark","Louis Berridge","Richard Vigars","Iain Wallace"],"pdf_url":"https://arxiv.org/pdf/2407.05206v4.pdf","comment":"Accepted at ECCV-Integrating Computer Vision in Smart Eyewear, 2024.\n 18 pages, 10 figures. First three authors contributed equally to this paper"},{"id":"http://arxiv.org/abs/2408.14118v1","updated":"2024-08-26T09:06:35Z","published":"2024-08-26T09:06:35Z","title":"Towards Lifelong Learning Embeddings: An Algorithmic Approach to\n Dynamically Extend Embeddings","summary":" The rapid evolution of technology has transformed business operations and\ncustomer interactions worldwide, with personalization emerging as a key\nopportunity for e-commerce companies to engage customers more effectively. The\napplication of machine learning, particularly that of deep learning models, has\ngained significant traction due to its ability to rapidly recognize patterns in\nlarge datasets, thereby offering numerous possibilities for personalization.\nThese models use embeddings to map discrete information, such as product IDs,\ninto a latent vector space, a method increasingly popular in recent years.\nHowever, e-commerce's dynamic nature, characterized by frequent new product\nintroductions, poses challenges for these embeddings, which typically require\nfixed dimensions and inputs, leading to the need for periodic retraining from\nscratch. 
This paper introduces a modular algorithm that extends embedding input\nsize while preserving learned knowledge, addressing the challenges posed by\ne-commerce's dynamism. The proposed algorithm also incorporates strategies to\nmitigate the cold start problem associated with new products. The results of\ninitial experiments suggest that this method outperforms traditional\nembeddings.\n","authors":["Miguel Alves Gomes","Philipp Meisen","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2408.14118v1.pdf","comment":"Accepted Extended Abstract for 3rd Workshop on End-End Customer\n Journey Optimization at KDD2024, Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.14116v1","updated":"2024-08-26T09:05:43Z","published":"2024-08-26T09:05:43Z","title":"Hierarchical Learning and Computing over Space-Ground Integrated\n Networks","summary":" Space-ground integrated networks hold great promise for providing global\nconnectivity, particularly in remote areas where large amounts of valuable data\nare generated by Internet of Things (IoT) devices but terrestrial\ncommunication infrastructure is lacking. The massive data is conventionally transferred to\nthe cloud server for centralized artificial intelligence (AI) model training,\nincurring huge communication overhead and raising privacy concerns. To address this, we\npropose a hierarchical learning and computing framework, which leverages the\nlow-latency characteristic of low-earth-orbit (LEO) satellites and the global\ncoverage of geostationary-earth-orbit (GEO) satellites, to provide global\naggregation services for locally trained models on ground IoT devices. Due to\nthe time-varying nature of satellite network topology and the energy\nconstraints of LEO satellites, efficiently aggregating the received local\nmodels from ground devices on LEO satellites is highly challenging. By\nleveraging the predictability of inter-satellite connectivity, modeling the\nspace network as a directed graph, we formulate a network energy minimization\nproblem for model aggregation, which turns out to be a Directed Steiner Tree\n(DST) problem. We propose a topology-aware energy-efficient routing (TAEER)\nalgorithm to solve the DST problem by finding a minimum spanning arborescence\non a substitute directed graph. Extensive simulations under real-world\nspace-ground integrated network settings demonstrate that the proposed TAEER\nalgorithm significantly reduces energy consumption and outperforms benchmarks.\n","authors":["Jingyang Zhu","Yuanming Shi","Yong Zhou","Chunxiao Jiang","Linling Kuang"],"pdf_url":"https://arxiv.org/pdf/2408.14116v1.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.08525v2","updated":"2024-08-26T08:49:48Z","published":"2024-03-13T13:33:35Z","title":"From Weak to Strong Sound Event Labels using Adaptive Change-Point\n Detection and Active Learning","summary":" We propose an adaptive change point detection method (A-CPD) for machine\nguided weak label annotation of audio recording segments. The goal is to\nmaximize the amount of information gained about the temporal activations of the\ntarget sounds. For each unlabeled audio recording, we use a prediction model to\nderive a probability curve used to guide annotation. The prediction model is\ninitially pre-trained on available annotated sound event data with classes that\nare disjoint from the classes in the unlabeled dataset. The prediction model\nthen gradually adapts to the annotations provided by the annotator in an active\nlearning loop. 
We derive query segments to guide the weak label annotator\ntowards strong labels, using change point detection on these probabilities. We\nshow that it is possible to derive strong labels of high quality with a limited\nannotation budget, and show favorable results for A-CPD when compared to two\nbaseline query segment strategies.\n","authors":["John Martinsson","Olof Mogren","Maria Sandsten","Tuomas Virtanen"],"pdf_url":"https://arxiv.org/pdf/2403.08525v2.pdf","comment":"Accepted at EUSIPCO 2024 (nominated best student paper)"},{"id":"http://arxiv.org/abs/2405.08334v2","updated":"2024-08-26T08:24:14Z","published":"2024-05-14T06:09:08Z","title":"Could Chemical LLMs benefit from Message Passing","summary":" Pretrained language models (LMs) showcase significant capabilities in\nprocessing molecular text, while concurrently, message passing neural networks\n(MPNNs) demonstrate resilience and versatility in the domain of molecular\nscience. Despite these advancements, we find there are limited studies\ninvestigating the bidirectional interactions between molecular structures and\ntheir corresponding textual representations. Therefore, in this paper, we\npropose two strategies to evaluate whether an information integration can\nenhance the performance: contrast learning, which involves utilizing an MPNN to\nsupervise the training of the LM, and fusion, which exploits information from\nboth models. Our empirical analysis reveals that the integration approaches\nexhibit superior performance compared to baselines when applied to smaller\nmolecular graphs, while these integration approaches do not yield performance\nenhancements on large scale graphs.\n","authors":["Jiaqing Xie","Ziheng Chi"],"pdf_url":"https://arxiv.org/pdf/2405.08334v2.pdf","comment":"Accepted at ACL @ Languages and Molecules 2024. In Proceedings of ACL\n 2024"},{"id":"http://arxiv.org/abs/2408.14086v1","updated":"2024-08-26T08:12:26Z","published":"2024-08-26T08:12:26Z","title":"ReLExS: Reinforcement Learning Explanations for Stackelberg No-Regret\n Learners","summary":" With the constraint of a no regret follower, will the players in a two-player\nStackelberg game still reach Stackelberg equilibrium? We first show when the\nfollower strategy is either reward-average or transform-reward-average, the two\nplayers can always get the Stackelberg Equilibrium. Then, we extend that the\nplayers can achieve the Stackelberg equilibrium in the two-player game under\nthe no regret constraint. Also, we show a strict upper bound of the follower's\nutility difference between with and without no regret constraint. Moreover, in\nconstant-sum two-player Stackelberg games with non-regret action sequences, we\nensure the total optimal utility of the game remains also bounded.\n","authors":["Xiangge Huang","Jingyuan Li","Jiaqing Xie"],"pdf_url":"https://arxiv.org/pdf/2408.14086v1.pdf","comment":"10 pages, 3 figures. Technical Report"},{"id":"http://arxiv.org/abs/2407.20003v2","updated":"2024-08-26T08:10:56Z","published":"2024-07-29T13:34:34Z","title":"On the Effects of Irrelevant Variables in Treatment Effect Estimation\n with Deep Disentanglement","summary":" Estimating treatment effects from observational data is paramount in\nhealthcare, education, and economics, but current deep disentanglement-based\nmethods to address selection bias are insufficiently handling irrelevant\nvariables. 
We demonstrate in experiments that this leads to prediction errors.\nWe disentangle pre-treatment variables with a deep embedding method and\nexplicitly identify and represent irrelevant variables, additionally to\ninstrumental, confounding and adjustment latent factors. To this end, we\nintroduce a reconstruction objective and create an embedding space for\nirrelevant variables using an attached autoencoder. Instead of relying on\nserendipitous suppression of irrelevant variables as in previous deep\ndisentanglement approaches, we explicitly force irrelevant variables into this\nembedding space and employ orthogonalization to prevent irrelevant information\nfrom leaking into the latent space representations of the other factors. Our\nexperiments with synthetic and real-world benchmark datasets show that we can\nbetter identify irrelevant variables and more precisely predict treatment\neffects than previous methods, while prediction quality degrades less when\nadditional irrelevant variables are introduced.\n","authors":["Ahmad Saeed Khan","Erik Schaffernicht","Johannes Andreas Stork"],"pdf_url":"https://arxiv.org/pdf/2407.20003v2.pdf","comment":"Paper is accepted at ECAI-2024"},{"id":"http://arxiv.org/abs/2408.14080v1","updated":"2024-08-26T08:02:57Z","published":"2024-08-26T08:02:57Z","title":"SONICS: Synthetic Or Not -- Identifying Counterfeit Songs","summary":" The recent surge in AI-generated songs presents exciting possibilities and\nchallenges. While these tools democratize music creation, they also necessitate\nthe ability to distinguish between human-composed and AI-generated songs for\nsafeguarding artistic integrity and content curation. Existing research and\ndatasets in fake song detection only focus on singing voice deepfake detection\n(SVDD), where the vocals are AI-generated but the instrumental music is sourced\nfrom real songs. However, this approach is inadequate for contemporary\nend-to-end AI-generated songs where all components (vocals, lyrics, music, and\nstyle) could be AI-generated. Additionally, existing datasets lack lyrics-music\ndiversity, long-duration songs, and open fake songs. To address these gaps, we\nintroduce SONICS, a novel dataset for end-to-end Synthetic Song Detection\n(SSD), comprising over 97k songs with over 49k synthetic songs from popular\nplatforms like Suno and Udio. Furthermore, we highlight the importance of\nmodeling long-range temporal dependencies in songs for effective authenticity\ndetection, an aspect overlooked in existing methods. To capture these patterns,\nwe propose a novel model, SpecTTTra, that is up to 3 times faster and 6 times\nmore memory efficient compared to popular CNN and Transformer-based models\nwhile maintaining competitive performance. Finally, we offer both AI-based and\nHuman evaluation benchmarks, addressing another deficiency in current research.\n","authors":["Md Awsafur Rahman","Zaber Ibn Abdul Hakim","Najibul Haque Sarker","Bishmoy Paul","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2408.14080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14073v1","updated":"2024-08-26T07:56:17Z","published":"2024-08-26T07:56:17Z","title":"Score-based change point detection via tracking the best of infinitely\n many experts","summary":" We suggest a novel algorithm for online change point detection based on\nsequential score function estimation and tracking the best expert approach. 
The\ncore of the procedure is a version of the fixed share forecaster for the case\nof infinite number of experts and quadratic loss functions. The algorithm shows\na promising performance in numerical experiments on artificial and real-world\ndata sets. We also derive new upper bounds on the dynamic regret of the fixed\nshare forecaster with varying parameter, which are of independent interest.\n","authors":["Anna Markovich","Nikita Puchkin"],"pdf_url":"https://arxiv.org/pdf/2408.14073v1.pdf","comment":"43 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.02971v3","updated":"2024-08-26T07:46:53Z","published":"2023-11-06T09:17:18Z","title":"TabRepo: A Large Scale Repository of Tabular Model Evaluations and its\n AutoML Applications","summary":" We introduce TabRepo, a new dataset of tabular model evaluations and\npredictions. TabRepo contains the predictions and metrics of 1310 models\nevaluated on 200 classification and regression datasets. We illustrate the\nbenefit of our dataset in multiple ways. First, we show that it allows to\nperform analysis such as comparing Hyperparameter Optimization against current\nAutoML systems while also considering ensembling at marginal cost by using\nprecomputed model predictions. Second, we show that our dataset can be readily\nleveraged to perform transfer-learning. In particular, we show that applying\nstandard transfer-learning techniques allows to outperform current\nstate-of-the-art tabular systems in accuracy, runtime and latency.\n","authors":["David Salinas","Nick Erickson"],"pdf_url":"https://arxiv.org/pdf/2311.02971v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14063v1","updated":"2024-08-26T07:44:53Z","published":"2024-08-26T07:44:53Z","title":"Bridging the gap between Learning-to-plan, Motion Primitives and Safe\n Reinforcement Learning","summary":" Trajectory planning under kinodynamic constraints is fundamental for advanced\nrobotics applications that require dexterous, reactive, and rapid skills in\ncomplex environments. These constraints, which may represent task, safety, or\nactuator limitations, are essential for ensuring the proper functioning of\nrobotic platforms and preventing unexpected behaviors. Recent advances in\nkinodynamic planning demonstrate that learning-to-plan techniques can generate\ncomplex and reactive motions under intricate constraints. However, these\ntechniques necessitate the analytical modeling of both the robot and the entire\ntask, a limiting assumption when systems are extremely complex or when\nconstructing accurate task models is prohibitive. This paper addresses this\nlimitation by combining learning-to-plan methods with reinforcement learning,\nresulting in a novel integration of black-box learning of motion primitives and\noptimization. We evaluate our approach against state-of-the-art safe\nreinforcement learning methods, showing that our technique, particularly when\nexploiting task structure, outperforms baseline methods in challenging\nscenarios such as planning to hit in robot air hockey. 
This work demonstrates\nthe potential of our integrated approach to enhance the performance and safety\nof robots operating under complex kinodynamic constraints.\n","authors":["Piotr Kicki","Davide Tateo","Puze Liu","Jonas Guenster","Jan Peters","Krzysztof Walas"],"pdf_url":"https://arxiv.org/pdf/2408.14063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12961v2","updated":"2024-08-26T07:43:03Z","published":"2024-08-23T10:12:08Z","title":"Symplectic Bregman divergences","summary":" We present a generalization of Bregman divergences in symplectic vector\nspaces that we term symplectic Bregman divergences. Symplectic Bregman\ndivergences are derived from a symplectic generalization of the Fenchel-Young\ninequality which relies on the notion of symplectic subdifferentials. The\nsymplectic Fenchel-Young inequality is obtained using the symplectic Fenchel\ntransform which is defined with respect to a linear symplectic form. When the\nsymplectic form is built from an inner product, we show that the corresponding\nsymplectic Bregman divergences amount to ordinary Bregman divergences with\nrespect to composite inner products. Some potential applications of symplectic\ndivergences in geometric mechanics, information geometry, and learning dynamics\nin machine learning are touched upon.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2408.12961v2.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.10635v4","updated":"2024-08-26T07:40:52Z","published":"2024-03-26T15:36:47Z","title":"Compressed Federated Reinforcement Learning with a Generative Model","summary":" Reinforcement learning has recently gained unprecedented popularity, yet it\nstill grapples with sample inefficiency. Addressing this challenge, federated\nreinforcement learning (FedRL) has emerged, wherein agents collaboratively\nlearn a single policy by aggregating local estimations. However, this\naggregation step incurs significant communication costs. In this paper, we\npropose CompFedRL, a communication-efficient FedRL approach incorporating both\n\\textit{periodic aggregation} and (direct/error-feedback) compression\nmechanisms. Specifically, we consider compressed federated $Q$-learning with a\ngenerative model setup, where a central server learns an optimal $Q$-function\nby periodically aggregating compressed $Q$-estimates from local agents. For the\nfirst time, we characterize the impact of these two mechanisms (which have\nremained elusive) by providing a finite-time analysis of our algorithm,\ndemonstrating strong convergence behaviors when utilizing either direct or\nerror-feedback compression. Our bounds indicate improved solution accuracy\nconcerning the number of agents and other federated hyperparameters while\nsimultaneously reducing communication costs. 
To corroborate our theory, we also\nconduct in-depth numerical experiments to verify our findings, considering\nTop-$K$ and Sparsified-$K$ sparsification operators.\n","authors":["Ali Beikmohammadi","Sarit Khirirat","Sindri Magnússon"],"pdf_url":"https://arxiv.org/pdf/2404.10635v4.pdf","comment":"European Conference on Machine Learning and Principles and Practice\n of Knowledge Discovery in Databases (ECML-PKDD 2024)"},{"id":"http://arxiv.org/abs/2408.10174v2","updated":"2024-08-26T07:34:46Z","published":"2024-08-19T17:32:15Z","title":"SMILE: Zero-Shot Sparse Mixture of Low-Rank Experts Construction From\n Pre-Trained Foundation Models","summary":" Deep model training on extensive datasets is increasingly becoming\ncost-prohibitive, prompting the widespread adoption of deep model fusion\ntechniques to leverage knowledge from pre-existing models. From simple weight\naveraging to more sophisticated methods like AdaMerging, model fusion\neffectively improves model performance and accelerates the development of new\nmodels. However, potential interference between parameters of individual models\nand the lack of interpretability in the fusion progress remain significant\nchallenges. Existing methods often try to resolve the parameter interference\nissue by evaluating attributes of parameters, such as their magnitude or sign,\nor by parameter pruning. In this study, we begin by examining the fine-tuning\nof linear layers through the lens of subspace analysis and explicitly define\nparameter interference as an optimization problem to shed light on this\nsubject. Subsequently, we introduce an innovative approach to model fusion\ncalled zero-shot Sparse MIxture of Low-rank Experts (SMILE) construction, which\nallows for the upscaling of source models into an MoE model without extra data\nor further training. Our approach relies on the observation that fine-tuning\nmostly keeps the important parts from the pre-training, but it uses less\nsignificant or unused areas to adapt to new tasks. Also, the issue of parameter\ninterference, which is intrinsically intractable in the original parameter\nspace, can be managed by expanding the dimensions. We conduct extensive\nexperiments across diverse scenarios, such as image classification and text\ngeneration tasks, using full fine-tuning and LoRA fine-tuning, and we apply our\nmethod to large language models (CLIP models, Flan-T5 models, and Mistral-7B\nmodels), highlighting the adaptability and scalability of SMILE. Code is\navailable at https://github.com/tanganke/fusion_bench\n","authors":["Anke Tang","Li Shen","Yong Luo","Shuai Xie","Han Hu","Lefei Zhang","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.10174v2.pdf","comment":"Code is available at https://github.com/tanganke/fusion_bench"},{"id":"http://arxiv.org/abs/2301.12778v3","updated":"2024-08-26T07:19:33Z","published":"2023-01-30T10:48:10Z","title":"Investigating Feature and Model Importance in Android Malware Detection:\n An Implemented Survey and Experimental Comparison of ML-Based Methods","summary":" The popularity of Android means it is a common target for malware. Over the\nyears, various studies have found that machine learning models can effectively\ndiscriminate malware from benign applications. However, as the operating system\nevolves, so does malware, bringing into question the findings of these previous\nstudies, many of which report very high accuracies using small, outdated, and\noften imbalanced datasets. 
In this paper, we reimplement 18 representative past\nworks and reevaluate them using a balanced, relevant, and up-to-date dataset\ncomprising 124,000 applications. We also carry out new experiments designed to\nfill holes in existing knowledge, and use our findings to identify the most\neffective features and models to use for Android malware detection within a\ncontemporary environment. We show that high detection accuracies (up to 96.8%)\ncan be achieved using features extracted through static analysis alone, with\nonly a modest benefit (1%) from adding far more expensive dynamic analysis.\nAPI calls and opcodes are the most productive static features, while TCP\nnetwork traffic provides the most predictive dynamic features. Random forests are generally the\nmost effective model, outperforming more complex deep learning approaches.\nWhilst directly combining static and dynamic features is generally ineffective,\nensembling separate models leads to performance comparable to the best\nmodels while using less brittle features.\n","authors":["Ali Muzaffar","Hani Ragab Hassen","Hind Zantout","Michael A Lones"],"pdf_url":"https://arxiv.org/pdf/2301.12778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13089v2","updated":"2024-08-26T06:40:07Z","published":"2024-08-23T14:16:10Z","title":"On the good reliability of an interval-based metric to validate\n prediction uncertainty for machine learning regression tasks","summary":" This short study presents an opportunistic approach to a (more) reliable\nvalidation method for prediction uncertainty average calibration. Considering\nthat variance-based calibration metrics (ZMS, NLL, RCE...) are quite sensitive\nto the presence of heavy tails in the uncertainty and error distributions, a\nshift is proposed to an interval-based metric, the Prediction Interval Coverage\nProbability (PICP). It is shown on a large ensemble of molecular properties\ndatasets that (1) sets of z-scores are well represented by Student's-$t(\\nu)$\ndistributions, $\\nu$ being the number of degrees of freedom; (2) accurate\nestimation of 95 $\\%$ prediction intervals can be obtained by the simple\n$2\\sigma$ rule for $\\nu>3$; and (3) the resulting PICPs are more quickly and\nreliably tested than variance-based calibration metrics. Overall, this method\nenables testing of 20 $\\%$ more datasets than ZMS testing. Conditional calibration\nis also assessed using the PICP approach.\n","authors":["Pascal Pernot"],"pdf_url":"https://arxiv.org/pdf/2408.13089v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14042v1","updated":"2024-08-26T06:39:49Z","published":"2024-08-26T06:39:49Z","title":"PAGE: Parametric Generative Explainer for Graph Neural Network","summary":" This article introduces PAGE, a parameterized generative interpretive\nframework. PAGE is capable of providing faithful explanations for any graph\nneural network without necessitating prior knowledge or internal details.\nSpecifically, we train an auto-encoder to generate explanatory substructures\nby designing an appropriate training strategy. Due to the dimensionality reduction\nof features in the latent space of the auto-encoder, it becomes easier to\nextract causal features leading to the model's output, which can be easily\nemployed to generate explanations. To accomplish this, we introduce an\nadditional discriminator to capture the causality between latent causal\nfeatures and the model's output. 
By designing appropriate optimization\nobjectives, the well-trained discriminator can be employed to constrain the\nencoder in generating enhanced causal features. Finally, these features are\nmapped to substructures of the input graph through the decoder to serve as\nexplanations. Compared to existing methods, PAGE operates at the sample scale\nrather than at the level of nodes or edges, eliminating the need for perturbation or encoding\nprocesses as seen in previous methods. Experimental results on both\nartificially synthesized and real-world datasets demonstrate that our approach\nnot only exhibits the highest faithfulness and accuracy but also significantly\noutperforms baseline models in terms of efficiency.\n","authors":["Yang Qiu","Wei Liu","Jun Wang","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2408.14042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10737v2","updated":"2024-08-26T06:37:24Z","published":"2024-06-15T20:47:38Z","title":"Dynamic Domains, Dynamic Solutions: DPCore for Continual Test-Time\n Adaptation","summary":" Continual Test-Time Adaptation (CTTA) seeks to adapt a source pre-trained\nmodel to continually changing, unlabeled target domains. Existing TTA methods\nare typically designed for environments where domain changes occur sequentially\nand can struggle in more dynamic scenarios, as illustrated in Figure\n\\ref{fig:settings}. Inspired by the principles of online K-Means, we introduce\na novel approach to CTTA through visual prompting. We propose a \\emph{Dynamic\nPrompt Coreset} that not only preserves knowledge from previously visited\ndomains but also accommodates learning from new potential domains. This is\ncomplemented by a distance-based \\emph{Weight Updating Mechanism} that ensures\nthe coreset remains current and relevant. Our approach employs a fixed model\narchitecture alongside the coreset and an innovative updating system to\neffectively mitigate challenges such as catastrophic forgetting and error\naccumulation. Extensive testing on four widely-used benchmarks demonstrates\nthat our method consistently outperforms state-of-the-art alternatives in both\nclassification and segmentation CTTA tasks across the structured and dynamic\nCTTA settings, with $99\\%$ fewer trainable parameters.\n","authors":["Yunbei Zhang","Akshay Mehra","Jihun Hamm"],"pdf_url":"https://arxiv.org/pdf/2406.10737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14037v1","updated":"2024-08-26T06:14:25Z","published":"2024-08-26T06:14:25Z","title":"Re-Mix: Optimizing Data Mixtures for Large Scale Imitation Learning","summary":" Increasingly large imitation learning datasets are being collected with the\ngoal of training foundation models for robotics. However, despite the fact that\ndata selection has been of utmost importance in vision and natural language\nprocessing, little work in robotics has questioned what data such models should\nactually be trained on. In this work we investigate how to weigh different\nsubsets or ``domains'' of robotics datasets for robot foundation model\npre-training. Concretely, we use distributionally robust optimization (DRO) to\nmaximize worst-case performance across all possible downstream domains. Our\nmethod, Re-Mix, addresses the wide range of challenges that arise when applying\nDRO to robotics datasets, including variability in action spaces and dynamics\nacross different datasets. Re-Mix employs early stopping, action normalization,\nand discretization to counteract these issues. 
Through extensive\nexperimentation on the largest open-source robot manipulation dataset, the Open\nX-Embodiment dataset, we demonstrate that data curation can have an outsized\nimpact on downstream performance. Specifically, domain weights learned by\nRe-Mix outperform uniform weights by 38\\% on average and outperform\nhuman-selected weights by 32\\% on datasets used to train existing generalist\nrobot policies, specifically the RT-X models.\n","authors":["Joey Hejna","Chethan Bhateja","Yichen Jian","Karl Pertsch","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2408.14037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19757v3","updated":"2024-08-26T05:54:22Z","published":"2024-05-30T07:06:02Z","title":"Improving SMOTE via Fusing Conditional VAE for Data-adaptive Noise\n Filtering","summary":" Recent advances in a generative neural network model extend the development\nof data augmentation methods. However, the augmentation methods based on the\nmodern generative models fail to achieve notable performance for class\nimbalance data compared to the conventional model, Synthetic Minority\nOversampling Technique (SMOTE). We investigate the problem of the generative\nmodel for imbalanced classification and introduce a framework to enhance the\nSMOTE algorithm using Variational Autoencoders (VAE). Our approach\nsystematically quantifies the density of data points in a low-dimensional\nlatent space using the VAE, simultaneously incorporating information on class\nlabels and classification difficulty. Then, the data points potentially\ndegrading the augmentation are systematically excluded, and the neighboring\nobservations are directly augmented on the data space. Empirical studies on\nseveral imbalanced datasets represent that this simple process innovatively\nimproves the conventional SMOTE algorithm over the deep learning models.\nConsequently, we conclude that the selection of minority data and the\ninterpolation in the data space are beneficial for imbalanced classification\nproblems with a relatively small number of data points.\n","authors":["Sungchul Hong","Seunghwan An","Jong-June Jeon"],"pdf_url":"https://arxiv.org/pdf/2405.19757v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18878v2","updated":"2024-08-26T05:54:21Z","published":"2024-03-27T10:46:24Z","title":"Teaching AI the Anatomy Behind the Scan: Addressing Anatomical Flaws in\n Medical Image Segmentation with Learnable Prior","summary":" Imposing key anatomical features, such as the number of organs, their shapes\nand relative positions, is crucial for building a robust multi-organ\nsegmentation model. Current attempts to incorporate anatomical features include\nbroadening the effective receptive field (ERF) size with data-intensive\nmodules, or introducing anatomical constraints that scales poorly to\nmulti-organ segmentation. We introduce a novel architecture called the\nAnatomy-Informed Cascaded Segmentation Network (AIC-Net). AIC-Net incorporates\na learnable input termed \"Anatomical Prior\", which can be adapted to\npatient-specific anatomy using a differentiable spatial deformation. The\ndeformed prior later guides decoder layers towards more anatomy-informed\npredictions. We repeat this process at a local patch level to enhance the\nrepresentation of intricate objects, resulting in a cascaded network structure.\nAIC-Net is a general method that enhances any existing segmentation models to\nbe more anatomy-aware. 
We have validated the performance of AIC-Net, with\nvarious backbones, on two multi-organ segmentation tasks: abdominal organs and\nvertebrae. For each respective task, our benchmarks demonstrate improved dice\nscore and Hausdorff distance.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2403.18878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14028v1","updated":"2024-08-26T05:38:27Z","published":"2024-08-26T05:38:27Z","title":"SurGen: Text-Guided Diffusion Model for Surgical Video Generation","summary":" Diffusion-based video generation models have made significant strides,\nproducing outputs with improved visual fidelity, temporal coherence, and user\ncontrol. These advancements hold great promise for improving surgical education\nby enabling more realistic, diverse, and interactive simulation environments.\nIn this study, we introduce SurGen, a text-guided diffusion model tailored for\nsurgical video synthesis, producing the highest resolution and longest duration\nvideos among existing surgical video generation models. We validate the visual\nand temporal quality of the outputs using standard image and video generation\nmetrics. Additionally, we assess their alignment to the corresponding text\nprompts through a deep learning classifier trained on surgical data. Our\nresults demonstrate the potential of diffusion models to serve as valuable\neducational tools for surgical trainees.\n","authors":["Joseph Cho","Samuel Schmidgall","Cyril Zakka","Mrudang Mathur","Rohan Shad","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2408.14028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14025v1","updated":"2024-08-26T05:31:46Z","published":"2024-08-26T05:31:46Z","title":"An Item Response Theory-based R Module for Algorithm Portfolio Analysis","summary":" Experimental evaluation is crucial in AI research, especially for assessing\nalgorithms across diverse tasks. Many studies often evaluate a limited set of\nalgorithms, failing to fully understand their strengths and weaknesses within a\ncomprehensive portfolio. This paper introduces an Item Response Theory (IRT)\nbased analysis tool for algorithm portfolio evaluation called AIRT-Module.\nTraditionally used in educational psychometrics, IRT models test question\ndifficulty and student ability using responses to test questions. Adapting IRT\nto algorithm evaluation, the AIRT-Module contains a Shiny web application and\nthe R package airt. AIRT-Module uses algorithm performance measures to compute\nanomalousness, consistency, and difficulty limits for an algorithm and the\ndifficulty of test instances. The strengths and weaknesses of algorithms are\nvisualised using the difficulty spectrum of the test instances. AIRT-Module\noffers a detailed understanding of algorithm capabilities across varied test\ninstances, thus enhancing comprehensive AI method assessment. It is available\nat https://sevvandi.shinyapps.io/AIRT/ .\n","authors":["Brodie Oldfield","Sevvandi Kandanaarachchi","Ziqi Xu","Mario Andrés Muñoz"],"pdf_url":"https://arxiv.org/pdf/2408.14025v1.pdf","comment":"10 Pages, 6 Figures. Submitted to SoftwareX"},{"id":"http://arxiv.org/abs/2408.10566v2","updated":"2024-08-26T05:08:29Z","published":"2024-08-20T06:05:52Z","title":"SparseGrow: Addressing Growth-Induced Forgetting in Task-Agnostic\n Continual Learning","summary":" In continual learning (CL), model growth enhances adaptability over new data,\nimproving knowledge retention for more tasks. 
However, improper model growth\ncan lead to severe degradation of previously learned knowledge, an issue we\nname as growth-induced forgetting (GIFt), especially in task-agnostic CL using\nentire grown model for inference. Existing works, despite adopting model growth\nand random initialization for better adaptability, often fail to recognize the\npresence of GIFt caused by improper model growth. This oversight limits\ncomprehensive control of forgetting and hinders full utilization of model\ngrowth. We are the first in CL to identify this issue and conduct an in-depth\nstudy on root cause of GIFt, where layer expansion stands out among model\ngrowth strategies, widening layers without affecting model functionality. Yet,\ndirect adoption of layer expansion presents challenges. It lacks data-driven\ncontrol and initialization of expanded parameters to balance adaptability and\nknowledge retention. This paper presents a novel SparseGrow approach to\novercome the issue of GIFt while enhancing adaptability over new data.\nSparseGrow employs data-driven sparse layer expansion to control efficient\nparameter usage during growth, reducing GIFt from excessive growth and\nfunctionality changes. It also combines sparse growth with on-data\ninitialization at training late-stage to create partially 0-valued expansions\nthat fit learned distribution, enhancing retention and adaptability. To further\nminimize forgetting, freezing is applied by calculating the sparse mask,\nallowing data-driven preservation of important parameters. Through experiments\nacross datasets with various settings, cases and task numbers, we demonstrate\nthe necessity of layer expansion and showcase the effectiveness of SparseGrow\nin overcoming GIFt, highlighting its adaptability and knowledge retention for\nincremental tasks.\n","authors":["Yuqing Zhao","Divya Saxena","Jiannong Cao","Xiaoyun Liu","Changlin Song"],"pdf_url":"https://arxiv.org/pdf/2408.10566v2.pdf","comment":"This paper has been submitted to the AAAI conference. If accepted,\n the final version will be updated to reflect the conference proceedings"},{"id":"http://arxiv.org/abs/2407.10784v3","updated":"2024-08-26T04:58:15Z","published":"2024-07-15T15:02:53Z","title":"AdapTable: Test-Time Adaptation for Tabular Data via Shift-Aware\n Uncertainty Calibrator and Label Distribution Handler","summary":" In real-world scenarios, tabular data often suffer from distribution shifts\nthat threaten the performance of machine learning models. Despite its\nprevalence and importance, handling distribution shifts in the tabular domain\nremains underexplored due to the inherent challenges within the tabular data\nitself. In this sense, test-time adaptation (TTA) offers a promising solution\nby adapting models to target data without accessing source data, crucial for\nprivacy-sensitive tabular domains. However, existing TTA methods either 1)\noverlook the nature of tabular distribution shifts, often involving label\ndistribution shifts, or 2) impose architectural constraints on the model,\nleading to a lack of applicability. To this end, we propose AdapTable, a novel\nTTA framework for tabular data. AdapTable operates in two stages: 1)\ncalibrating model predictions using a shift-aware uncertainty calibrator, and\n2) adjusting these predictions to match the target label distribution with a\nlabel distribution handler. We validate the effectiveness of AdapTable through\ntheoretical analysis and extensive experiments on various distribution shift\nscenarios. 
Our results demonstrate AdapTable's ability to handle various\nreal-world distribution shifts, achieving up to a 16% improvement on the HELOC\ndataset.\n","authors":["Changhun Kim","Taewon Kim","Seungyeon Woo","June Yong Yang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2407.10784v3.pdf","comment":"Under Review at AAAI 2025"},{"id":"http://arxiv.org/abs/2408.14014v1","updated":"2024-08-26T04:39:33Z","published":"2024-08-26T04:39:33Z","title":"Category-Theoretical and Topos-Theoretical Frameworks in Machine\n Learning: A Survey","summary":" In this survey, we provide an overview of category theory-derived machine\nlearning from four mainstream perspectives: gradient-based learning,\nprobability-based learning, invariance and equivalence-based learning, and\ntopos-based learning. For the first three topics, we primarily review research\nin the past five years, updating and expanding on the previous survey by\nShiebler et al.. The fourth topic, which delves into higher category theory,\nparticularly topos theory, is surveyed for the first time in this paper. In\ncertain machine learning methods, the compositionality of functors plays a\nvital role, prompting the development of specific categorical frameworks.\nHowever, when considering how the global properties of a network reflect in\nlocal structures and how geometric properties are expressed with logic, the\ntopos structure becomes particularly significant and profound.\n","authors":["Yiyang Jia","Guohong Peng","Zheng Yang","Tianhao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14010v1","updated":"2024-08-26T04:31:55Z","published":"2024-08-26T04:31:55Z","title":"Improving Water Quality Time-Series Prediction in Hong Kong using\n Sentinel-2 MSI Data and Google Earth Engine Cloud Computing","summary":" Effective water quality monitoring in coastal regions is crucial due to the\nprogressive deterioration caused by pollution and human activities. To address\nthis, this study develops time-series models to predict chlorophyll-a (Chl-a),\nsuspended solids (SS), and turbidity using Sentinel-2 satellite data and Google\nEarth Engine (GEE) in the coastal regions of Hong Kong. Leveraging Long\nShort-Term Memory (LSTM) Recurrent Neural Networks, the study incorporates\nextensive temporal datasets to enhance prediction accuracy. The models utilize\nspectral data from Sentinel-2, focusing on optically active components, and\ndemonstrate that selected variables closely align with the spectral\ncharacteristics of Chl-a and SS. The results indicate improved predictive\nperformance over previous methods, highlighting the potential for remote\nsensing technology in continuous and comprehensive water quality assessment.\n","authors":["Rohin Sood","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14001v1","updated":"2024-08-26T03:58:20Z","published":"2024-08-26T03:58:20Z","title":"Decentralized Federated Learning with Model Caching on Mobile Agents","summary":" Federated Learning (FL) aims to train a shared model using data and\ncomputation power on distributed agents coordinated by a central server.\nDecentralized FL (DFL) utilizes local model exchange and aggregation between\nagents to reduce the communication and computation overheads on the central\nserver. 
However, when agents are mobile, the communication opportunity between\nagents can be sporadic, largely hindering the convergence and accuracy of DFL.\nIn this paper, we study delay-tolerant model spreading and aggregation enabled\nby model caching on mobile agents. Each agent stores not only its own model,\nbut also models of agents encountered in the recent past. When two agents meet,\nthey exchange their own models as well as the cached models. Local model\naggregation works on all models in the cache. We theoretically analyze the\nconvergence of DFL with cached models, explicitly taking into account the model\nstaleness introduced by caching. We design and compare different model caching\nalgorithms for different DFL and mobility scenarios. We conduct detailed case\nstudies in a vehicular network to systematically investigate the interplay\nbetween agent mobility, cache staleness, and model convergence. In our\nexperiments, cached DFL converges quickly, and significantly outperforms DFL\nwithout caching.\n","authors":["Xiaoyu Wang","Guojun Xiong","Houwei Cao","Jian Li","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14001v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2407.06886v7","updated":"2024-08-26T03:25:12Z","published":"2024-07-09T14:14:47Z","title":"Aligning Cyber Space with Physical World: A Comprehensive Survey on\n Embodied AI","summary":" Embodied Artificial Intelligence (Embodied AI) is crucial for achieving\nArtificial General Intelligence (AGI) and serves as a foundation for various\napplications that bridge cyberspace and the physical world. Recently, the\nemergence of Multi-modal Large Models (MLMs) and World Models (WMs) have\nattracted significant attention due to their remarkable perception,\ninteraction, and reasoning capabilities, making them a promising architecture\nfor the brain of embodied agents. However, there is no comprehensive survey for\nEmbodied AI in the era of MLMs. In this survey, we give a comprehensive\nexploration of the latest advancements in Embodied AI. Our analysis firstly\nnavigates through the forefront of representative works of embodied robots and\nsimulators, to fully understand the research focuses and their limitations.\nThen, we analyze four main research targets: 1) embodied perception, 2)\nembodied interaction, 3) embodied agent, and 4) sim-to-real adaptation,\ncovering the state-of-the-art methods, essential paradigms, and comprehensive\ndatasets. Additionally, we explore the complexities of MLMs in virtual and real\nembodied agents, highlighting their significance in facilitating interactions\nin dynamic digital and physical environments. Finally, we summarize the\nchallenges and limitations of embodied AI and discuss their potential future\ndirections. We hope this survey will serve as a foundational reference for the\nresearch community and inspire continued innovation. The associated project can\nbe found at https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List.\n","authors":["Yang Liu","Weixing Chen","Yongjie Bai","Xiaodan Liang","Guanbin Li","Wen Gao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2407.06886v7.pdf","comment":"The first comprehensive review of Embodied AI in the era of MLMs, 39\n pages. 
We also provide the paper list for Embodied AI:\n https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List"},{"id":"http://arxiv.org/abs/2408.13991v1","updated":"2024-08-26T03:19:52Z","published":"2024-08-26T03:19:52Z","title":"Dual-CBA: Improving Online Continual Learning via Dual Continual Bias\n Adaptors from a Bi-level Optimization Perspective","summary":" In online continual learning (CL), models trained on changing distributions\neasily forget previously learned knowledge and bias toward newly received\ntasks. To address this issue, we present Continual Bias Adaptor (CBA), a\nbi-level framework that augments the classification network to adapt to\ncatastrophic distribution shifts during training, enabling the network to\nachieve a stable consolidation of all seen tasks. However, the CBA module\nadjusts distribution shifts in a class-specific manner, exacerbating the\nstability gap issue and, to some extent, fails to meet the need for continual\ntesting in online CL. To mitigate this challenge, we further propose a novel\nclass-agnostic CBA module that separately aggregates the posterior\nprobabilities of classes from new and old tasks, and applies a stable\nadjustment to the resulting posterior probabilities. We combine the two kinds\nof CBA modules into a unified Dual-CBA module, which thus is capable of\nadapting to catastrophic distribution shifts and simultaneously meets the\nreal-time testing requirements of online CL. Besides, we propose Incremental\nBatch Normalization (IBN), a tailored BN module to re-estimate its population\nstatistics for alleviating the feature bias arising from the inner loop\noptimization problem of our bi-level framework. To validate the effectiveness\nof the proposed method, we theoretically provide some insights into how it\nmitigates catastrophic distribution shifts, and empirically demonstrate its\nsuperiority through extensive experiments based on four rehearsal-based\nbaselines and three public continual learning benchmarks.\n","authors":["Quanziang Wang","Renzhen Wang","Yichen Wu","Xixi Jia","Minghao Zhou","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2408.13991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15951v4","updated":"2024-08-26T03:09:46Z","published":"2023-06-28T06:21:22Z","title":"Reduce Computational Complexity for Convolutional Layers by Skipping\n Zeros","summary":" Convolutional neural networks necessitate good algorithms to reduce\ncomplexity, and sufficient utilization of parallel processors for acceleration.\nWithin convolutional layers, there are three types of operators: convolution\nused in forward propagation, deconvolution and dilated-convolution utilized in\nbackward propagation. During the execution of these operators, zeros are\ntypically added to tensors, leading to redundant calculations and unnecessary\nstrain on hardware. To circumvent these inefficiencies, we propose the C-K-S\nalgorithm, accompanied by efficient GPU implementations. C-K-S trims filters to\nexclude zero-padding. For deconvolution and dilated-convolution, C-K-S\ntransforms sparse tensors into dense tensors, and standardizes the local\ncomputational rules to simplify the hardware control. 
The experimental results\ndemonstrate that C-K-S offers good performance in terms of speed and\nconvergence, surpassing the capabilities of PyTorch and cuDNN in certain\nscenarios.\n","authors":["Zhiyi Zhang","Pengfei Zhang","Zhuopin Xu","Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.15951v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08713v2","updated":"2024-08-26T03:03:47Z","published":"2024-08-16T12:51:52Z","title":"Beyond KAN: Introducing KarSein for Adaptive High-Order Feature\n Interaction Modeling in CTR Prediction","summary":" Modeling feature interactions is crucial for click-through rate (CTR)\nprediction, particularly when it comes to high-order explicit interactions.\nTraditional methods struggle with this task because they often predefine a\nmaximum interaction order, which relies heavily on prior knowledge and can\nlimit the model's effectiveness. Additionally, modeling high-order interactions\ntypically leads to increased computational costs. Therefore, the challenge lies\nin adaptively modeling high-order feature interactions while maintaining\nefficiency. To address this issue, we introduce Kolmogorov-Arnold Represented\nSparse Efficient Interaction Network (KarSein), designed to optimize both\npredictive accuracy and computational efficiency. We firstly identify\nlimitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and\nthen introduce KarSein to overcome these issues. It features a novel\narchitecture that reduces the computational costs of KAN and supports embedding\nvectors as feature inputs. Additionally, KarSein employs guided symbolic\nregression to address the challenge of KAN in spontaneously learning\nmultiplicative relationships. Extensive experiments demonstrate KarSein's\nsuperior performance, achieving significant predictive accuracy with minimal\ncomputational overhead. Furthermore, KarSein maintains strong global\nexplainability while enabling the removal of redundant features, resulting in a\nsparse network structure. These advantages also position KarSein as a promising\nmethod for efficient inference.\n","authors":["Yunxiao Shi","Wujiang Xu","Mingyu Jin","Haimin Zhang","Qiang Wu","Yongfeng Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08713v2.pdf","comment":"KarSein for CTR"},{"id":"http://arxiv.org/abs/2304.06879v2","updated":"2024-08-26T02:59:10Z","published":"2023-04-14T01:12:48Z","title":"Performative Prediction with Neural Networks","summary":" Performative prediction is a framework for learning models that influence the\ndata they intend to predict. We focus on finding classifiers that are\nperformatively stable, i.e. optimal for the data distribution they induce.\nStandard convergence results for finding a performatively stable classifier\nwith the method of repeated risk minimization assume that the data distribution\nis Lipschitz continuous to the model's parameters. Under this assumption, the\nloss must be strongly convex and smooth in these parameters; otherwise, the\nmethod will diverge for some problems. In this work, we instead assume that the\ndata distribution is Lipschitz continuous with respect to the model's\npredictions, a more natural assumption for performative systems. As a result,\nwe are able to significantly relax the assumptions on the loss function. In\nparticular, we do not need to assume convexity with respect to the model's\nparameters. As an illustration, we introduce a resampling procedure that models\nrealistic distribution shifts and show that it satisfies our assumptions. 
We\nsupport our theory by showing that one can learn performatively stable\nclassifiers with neural networks making predictions about real data that shift\naccording to our proposed procedure.\n","authors":["Mehrnaz Mofakhami","Ioannis Mitliagkas","Gauthier Gidel"],"pdf_url":"https://arxiv.org/pdf/2304.06879v2.pdf","comment":"Published at AISTATS 2023; Theoretical results extended"},{"id":"http://arxiv.org/abs/2408.13282v1","updated":"2024-08-26T02:53:55Z","published":"2024-08-26T02:53:55Z","title":"Question answering system of bridge design specification based on large\n language model","summary":" This paper constructs question answering system for bridge design\nspecification based on large language model. Three implementation schemes are\ntried: full fine-tuning of the Bert pretrained model, parameter-efficient\nfine-tuning of the Bert pretrained model, and self-built language model from\nscratch. Through the self-built question and answer task dataset, based on the\ntensorflow and keras deep learning platform framework, the model is constructed\nand trained to predict the start position and end position of the answer in the\nbridge design specification given by the user. The experimental results show\nthat full fine-tuning of the Bert pretrained model achieves 100% accuracy in\nthe training-dataset, validation-dataset and test-dataset, and the system can\nextract the answers from the bridge design specification given by the user to\nanswer various questions of the user; While parameter-efficient fine-tuning of\nthe Bert pretrained model and self-built language model from scratch perform\nwell in the training-dataset, their generalization ability in the test-dataset\nneeds to be improved. The research of this paper provides a useful reference\nfor the development of question answering system in professional field.\n","authors":["Leye Zhang","Xiangxiang Tian","Hongjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13282v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.13986v1","updated":"2024-08-26T02:36:55Z","published":"2024-08-26T02:36:55Z","title":"AgentMove: Predicting Human Mobility Anywhere Using Large Language Model\n based Agentic Framework","summary":" Human mobility prediction plays a crucial role in various real-world\napplications. Although deep learning based models have shown promising results\nover the past decade, their reliance on extensive private mobility data for\ntraining and their inability to perform zero-shot predictions, have hindered\nfurther advancements. Recently, attempts have been made to apply large language\nmodels (LLMs) to mobility prediction task. However, their performance has been\nconstrained by the absence of a systematic design of workflow. They directly\ngenerate the final output using LLMs, which limits the potential of LLMs to\nuncover complex mobility patterns and underestimates their extensive reserve of\nglobal geospatial knowledge. In this paper, we introduce AgentMove, a\nsystematic agentic prediction framework to achieve generalized mobility\nprediction for any cities worldwide. In AgentMove, we first decompose the\nmobility prediction task into three sub-tasks and then design corresponding\nmodules to complete these subtasks, including spatial-temporal memory for\nindividual mobility pattern mining, world knowledge generator for modeling the\neffects of urban structure and collective knowledge extractor for capturing the\nshared patterns among population. 
Finally, we combine the results of three\nmodules and conduct a reasoning step to generate the final predictions.\nExtensive experiments on mobility data from two sources in 12 cities\ndemonstrate that AgentMove outperforms the best baseline more than 8% in\nvarious metrics and it shows robust predictions with various LLMs as base and\nalso less geographical bias across cities. Codes and data can be found in\nhttps://github.com/tsinghua-fib-lab/AgentMove.\n","authors":["Jie Feng","Yuwei Du","Jie Zhao","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2408.13986v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2209.11691v4","updated":"2024-08-26T02:33:01Z","published":"2022-09-23T16:11:09Z","title":"Linear multidimensional regression with interactive fixed-effects","summary":" This paper studies a linear and additively separable model for\nmultidimensional panel data of three or more dimensions with unobserved\ninteractive fixed effects. Two approaches are considered to account for these\nunobserved interactive fixed-effects when estimating coefficients on the\nobserved covariates. First, the model is embedded within the standard two\ndimensional panel framework and restrictions are formed under which the factor\nstructure methods in Bai (2009) lead to consistent estimation of model\nparameters, but at slow rates of convergence. The second approach develops a\nkernel weighted fixed-effects method that is more robust to the\nmultidimensional nature of the problem and can achieve the parametric rate of\nconsistency under certain conditions. Theoretical results and simulations show\nsome benefits to standard two-dimensional panel methods when the structure of\nthe interactive fixed-effect term is known, but also highlight how the kernel\nweighted method performs well without knowledge of this structure. The methods\nare implemented to estimate the demand elasticity for beer.\n","authors":["Hugo Freeman"],"pdf_url":"https://arxiv.org/pdf/2209.11691v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12095v2","updated":"2024-08-26T02:26:31Z","published":"2024-08-22T03:08:49Z","title":"uMedSum: A Unified Framework for Advancing Medical Abstractive\n Summarization","summary":" Medical abstractive summarization faces the challenge of balancing\nfaithfulness and informativeness. Current methods often sacrifice key\ninformation for faithfulness or introduce confabulations when prioritizing\ninformativeness. While recent advancements in techniques like in-context\nlearning (ICL) and fine-tuning have improved medical summarization, they often\noverlook crucial aspects such as faithfulness and informativeness without\nconsidering advanced methods like model reasoning and self-improvement.\nMoreover, the field lacks a unified benchmark, hindering systematic evaluation\ndue to varied metrics and datasets. This paper addresses these gaps by\npresenting a comprehensive benchmark of six advanced abstractive summarization\nmethods across three diverse datasets using five standardized metrics. Building\non these findings, we propose uMedSum, a modular hybrid summarization framework\nthat introduces novel approaches for sequential confabulation removal followed\nby key missing information addition, ensuring both faithfulness and\ninformativeness. Our work improves upon previous GPT-4-based state-of-the-art\n(SOTA) medical summarization methods, significantly outperforming them in both\nquantitative metrics and qualitative domain expert evaluations. 
Notably, we\nachieve an average relative performance improvement of 11.8% in reference-free\nmetrics over the previous SOTA. Doctors prefer uMedSum's summaries 6 times more\nthan previous SOTA in difficult cases where there are chances of confabulations\nor missing information. These results highlight uMedSum's effectiveness and\ngeneralizability across various datasets and metrics, marking a significant\nadvancement in medical summarization.\n","authors":["Aishik Nagar","Yutong Liu","Andy T. Liu","Viktor Schlegel","Vijay Prakash Dwivedi","Arun-Kumar Kaliya-Perumal","Guna Pratheep Kalanchiam","Yili Tang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2408.12095v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2403.10650v2","updated":"2024-08-26T02:19:11Z","published":"2024-03-15T19:35:10Z","title":"PALM: Pushing Adaptive Learning Rate Mechanisms for Continual Test-Time\n Adaptation","summary":" Real-world vision models in dynamic environments face rapid shifts in domain\ndistributions, leading to decreased recognition performance. Using unlabeled\ntest data, continual test-time adaptation (CTTA) directly adjusts a pre-trained\nsource discriminative model to these changing domains. A highly effective CTTA\nmethod involves applying layer-wise adaptive learning rates for selectively\nadapting pre-trained layers. However, it suffers from the poor estimation of\ndomain shift and the inaccuracies arising from the pseudo-labels. This work\naims to overcome these limitations by identifying layers for adaptation via\nquantifying model prediction uncertainty without relying on pseudo-labels. We\nutilize the magnitude of gradients as a metric, calculated by backpropagating\nthe KL divergence between the softmax output and a uniform distribution, to\nselect layers for further adaptation. Subsequently, for the parameters\nexclusively belonging to these selected layers, with the remaining ones frozen,\nwe evaluate their sensitivity to approximate the domain shift and adjust their\nlearning rates accordingly. We conduct extensive image classification\nexperiments on CIFAR-10C, CIFAR-100C, and ImageNet-C, demonstrating the\nsuperior efficacy of our method compared to prior approaches.\n","authors":["Sarthak Kumar Maharana","Baoming Zhang","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2403.10650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13979v1","updated":"2024-08-26T02:09:05Z","published":"2024-08-26T02:09:05Z","title":"Nemesis: Normalizing the Soft-prompt Vectors of Vision-Language Models","summary":" With the prevalence of large-scale pretrained vision-language models (VLMs),\nsuch as CLIP, soft-prompt tuning has become a popular method for adapting these\nmodels to various downstream tasks. However, few works delve into the inherent\nproperties of learnable soft-prompt vectors, specifically the impact of their\nnorms to the performance of VLMs. This motivates us to pose an unexplored\nresearch question: ``Do we need to normalize the soft prompts in VLMs?'' To\nfill this research gap, we first uncover a phenomenon, called the\n\\textbf{Low-Norm Effect} by performing extensive corruption experiments,\nsuggesting that reducing the norms of certain learned prompts occasionally\nenhances the performance of VLMs, while increasing them often degrades it. 
To\nharness this effect, we propose a novel method named \\textbf{N}ormalizing\nth\\textbf{e} soft-pro\\textbf{m}pt v\\textbf{e}ctors of vi\\textbf{si}on-language\nmodel\\textbf{s} (\\textbf{Nemesis}) to normalize soft-prompt vectors in VLMs. To\nthe best of our knowledge, our work is the first to systematically investigate\nthe role of norms of soft-prompt vector in VLMs, offering valuable insights for\nfuture research in soft-prompt tuning. The code is available at\n\\texttt{\\href{https://github.com/ShyFoo/Nemesis}{https://github.com/ShyFoo/Nemesis}}.\n","authors":["Shuai Fu","Xiequn Wang","Qiushi Huang","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13979v1.pdf","comment":"Accepted at ICLR 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2408.02679v2","updated":"2024-08-26T01:17:43Z","published":"2024-07-31T08:44:34Z","title":"Visual Analysis of Multi-outcome Causal Graphs","summary":" We introduce a visual analysis method for multiple causal graphs with\ndifferent outcome variables, namely, multi-outcome causal graphs. Multi-outcome\ncausal graphs are important in healthcare for understanding multimorbidity and\ncomorbidity. To support the visual analysis, we collaborated with medical\nexperts to devise two comparative visualization techniques at different stages\nof the analysis process. First, a progressive visualization method is proposed\nfor comparing multiple state-of-the-art causal discovery algorithms. The method\ncan handle mixed-type datasets comprising both continuous and categorical\nvariables and assist in the creation of a fine-tuned causal graph of a single\noutcome. Second, a comparative graph layout technique and specialized visual\nencodings are devised for the quick comparison of multiple causal graphs. In\nour visual analysis approach, analysts start by building individual causal\ngraphs for each outcome variable, and then, multi-outcome causal graphs are\ngenerated and visualized with our comparative technique for analyzing\ndifferences and commonalities of these causal graphs. Evaluation includes\nquantitative measurements on benchmark datasets, a case study with a medical\nexpert, and expert user studies with real-world health research data.\n","authors":["Mengjie Fan","Jinlu Yu","Daniel Weiskopf","Nan Cao","Huai-Yu Wang","Liang Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.02679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10148v2","updated":"2024-08-26T01:08:49Z","published":"2024-06-14T15:59:36Z","title":"A Primal-Dual-Assisted Penalty Approach to Bilevel Optimization with\n Coupled Constraints","summary":" Interest in bilevel optimization has grown in recent years, partially due to\nits applications to tackle challenging machine-learning problems. Several\nexciting recent works have been centered around developing efficient\ngradient-based algorithms that can solve bilevel optimization problems with\nprovable guarantees. However, the existing literature mainly focuses on bilevel\nproblems either without constraints, or featuring only simple constraints that\ndo not couple variables across the upper and lower levels, excluding a range of\ncomplex applications. Our paper studies this challenging but less explored\nscenario and develops a (fully) first-order algorithm, which we term BLOCC, to\ntackle BiLevel Optimization problems with Coupled Constraints. 
We establish\nrigorous convergence theory for the proposed algorithm and demonstrate its\neffectiveness on two well-known real-world applications - hyperparameter\nselection in support vector machine (SVM) and infrastructure planning in\ntransportation networks using the real data from the city of Seville.\n","authors":["Liuyuan Jiang","Quan Xiao","Victor M. Tenorio","Fernando Real-Rojas","Antonio G. Marques","Tianyi Chen"],"pdf_url":"https://arxiv.org/pdf/2406.10148v2.pdf","comment":"In this version, we have made the following updates: (1) Added a\n sensitivity analysis of the algorithm's hyperparameters (stepsize and penalty\n constant) in Appendix G. (2) Included a computational complexity analysis and\n comparison in Appendix H. (3) Explicitly stated the inner-loop stepsizes in\n Remarks 2 and 3"},{"id":"http://arxiv.org/abs/2406.18747v2","updated":"2024-08-26T01:07:11Z","published":"2024-06-26T20:25:53Z","title":"A Stem-Agnostic Single-Decoder System for Music Source Separation Beyond\n Four Stems","summary":" Despite significant recent progress across multiple subtasks of audio source\nseparation, few music source separation systems support separation beyond the\nfour-stem vocals, drums, bass, and other (VDBO) setup. Of the very few current\nsystems that support source separation beyond this setup, most continue to rely\non an inflexible decoder setup that can only support a fixed pre-defined set of\nstems. Increasing stem support in these inflexible systems correspondingly\nrequires increasing computational complexity, rendering extensions of these\nsystems computationally infeasible for long-tail instruments. In this work, we\npropose Banquet, a system that allows source separation of multiple stems using\njust one decoder. A bandsplit source separation model is extended to work in a\nquery-based setup in tandem with a music instrument recognition PaSST model. On\nthe MoisesDB dataset, Banquet, at only 24.9 M trainable parameters, approached\nthe performance level of the significantly more complex 6-stem Hybrid\nTransformer Demucs on VDBO stems and outperformed it on guitar and piano. The\nquery-based setup allows for the separation of narrow instrument classes such\nas clean acoustic guitars, and can be successfully applied to the extraction of\nless common stems such as reeds and organs. Implementation is available at\nhttps://github.com/kwatcharasupat/query-bandit.\n","authors":["Karn N. Watcharasupat","Alexander Lerch"],"pdf_url":"https://arxiv.org/pdf/2406.18747v2.pdf","comment":"Accepted to the 25th International Society for Music Information\n Retrieval Conference (ISMIR 2024). Camera-ready version"},{"id":"http://arxiv.org/abs/2407.07275v2","updated":"2024-08-26T00:55:01Z","published":"2024-07-09T23:39:37Z","title":"Remastering Divide and Remaster: A Cinematic Audio Source Separation\n Dataset with Multilingual Support","summary":" Cinematic audio source separation (CASS), as a problem of extracting the\ndialogue, music, and effects stems from their mixture, is a relatively new\nsubtask of audio source separation. To date, only one publicly available\ndataset exists for CASS, that is, the Divide and Remaster (DnR) dataset, which\nis currently at version 2. While DnR v2 has been an incredibly useful resource\nfor CASS, several areas of improvement have been identified, particularly\nthrough its use in the 2023 Sound Demixing Challenge. 
In this work, we develop\nversion 3 of the DnR dataset, addressing issues relating to vocal content in\nnon-dialogue stems, loudness distributions, mastering process, and linguistic\ndiversity. In particular, the dialogue stem of DnR v3 includes speech content\nfrom more than 30 languages from multiple families including but not limited to\nthe Germanic, Romance, Indo-Aryan, Dravidian, Malayo-Polynesian, and Bantu\nfamilies. Benchmark results using the Bandit model indicated that training on\nmultilingual data yields significant generalizability to the model even in\nlanguages with low data availability. Even in languages with high data\navailability, the multilingual model often performs on par or better than\ndedicated models trained on monolingual CASS datasets. Dataset and model\nimplementation will be made available at\nhttps://github.com/kwatcharasupat/source-separation-landing.\n","authors":["Karn N. Watcharasupat","Chih-Wei Wu","Iroro Orife"],"pdf_url":"https://arxiv.org/pdf/2407.07275v2.pdf","comment":"Accepted to the 5th IEEE International Symposium on the Internet of\n Sounds. Camera-ready version"},{"id":"http://arxiv.org/abs/2405.00697v2","updated":"2024-08-26T00:53:17Z","published":"2024-04-10T11:20:52Z","title":"Unveiling Nonlinear Dynamics in Catastrophe Bond Pricing: A Machine\n Learning Perspective","summary":" This paper explores the implications of using machine learning models in the\npricing of catastrophe (CAT) bonds. By integrating advanced machine learning\ntechniques, our approach uncovers nonlinear relationships and complex\ninteractions between key risk factors and CAT bond spreads -- dynamics that are\noften overlooked by traditional linear regression models. Using primary market\nCAT bond transaction records between January 1999 and March 2021, our findings\ndemonstrate that machine learning models not only enhance the accuracy of CAT\nbond pricing but also provide a deeper understanding of how various risk\nfactors interact and influence bond prices in a nonlinear way. These findings\nsuggest that investors and issuers can benefit from incorporating machine\nlearning to better capture the intricate interplay between risk factors when\npricing CAT bonds. The results also highlight the potential for machine\nlearning models to refine our understanding of asset pricing in markets\ncharacterized by complex risk structures.\n","authors":["Xiaowei Chen","Hong Li","Yufan Lu","Rui Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.00697v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03588v2","updated":"2024-08-26T00:52:40Z","published":"2024-08-07T07:04:29Z","title":"Facing the Music: Tackling Singing Voice Separation in Cinematic Audio\n Source Separation","summary":" Cinematic audio source separation (CASS), as a standalone problem of\nextracting individual stems from their mixture, is a fairly new subtask of\naudio source separation. A typical setup of CASS is a three-stem problem, with\nthe aim of separating the mixture into the dialogue (DX), music (MX), and\neffects (FX) stems. Given the creative nature of cinematic sound production,\nhowever, several edge cases exist; some sound sources do not fit neatly in any\nof these three stems, necessitating the use of additional auxiliary stems in\nproduction. One very common edge case is the singing voice in film audio, which\nmay belong in either the DX or MX or neither, depending heavily on the\ncinematic context. 
In this work, we demonstrate a very straightforward\nextension of the dedicated-decoder Bandit and query-based single-decoder\nBanquet models to a four-stem problem, treating non-musical dialogue,\ninstrumental music, singing voice, and effects as separate stems.\nInterestingly, the query-based Banquet model outperformed the dedicated-decoder\nBandit model. We hypothesized that this is due to a better feature alignment at\nthe bottleneck as enforced by the band-agnostic FiLM layer. Dataset and model\nimplementation will be made available at\nhttps://github.com/kwatcharasupat/source-separation-landing.\n","authors":["Karn N. Watcharasupat","Chih-Wei Wu","Iroro Orife"],"pdf_url":"https://arxiv.org/pdf/2408.03588v2.pdf","comment":"Submitted to the Late-Breaking Demo Session of the 25th International\n Society for Music Information Retrieval (ISMIR) Conference, 2024"},{"id":"http://arxiv.org/abs/2402.17131v2","updated":"2024-08-26T23:59:43Z","published":"2024-02-27T01:53:02Z","title":"Predicting O-GlcNAcylation Sites in Mammalian Proteins with Transformers\n and RNNs Trained with a New Loss Function","summary":" Glycosylation, a protein modification, has multiple essential functional and\nstructural roles. O-GlcNAcylation, a subtype of glycosylation, has the\npotential to be an important target for therapeutics, but methods to reliably\npredict O-GlcNAcylation sites had not been available until 2023; a 2021 review\ncorrectly noted that published models were insufficient and failed to\ngeneralize. Moreover, many are no longer usable. In 2023, a considerably better\nRNN model with an F$_1$ score of 36.17% and an MCC of 34.57% on a large dataset\nwas published. This article first sought to improve these metrics using\ntransformer encoders. While transformers displayed high performance on this\ndataset, their performance was inferior to that of the previously published\nRNN. We then created a new loss function, which we call the weighted focal\ndifferentiable MCC, to improve the performance of classification models. RNN\nmodels trained with this new function display superior performance to models\ntrained using the weighted cross-entropy loss; this new function can also be\nused to fine-tune trained models. A two-cell RNN trained with this loss\nachieves state-of-the-art performance in O-GlcNAcylation site prediction with\nan F$_1$ score of 38.88% and an MCC of 38.20% on that large dataset.\n","authors":["Pedro Seber"],"pdf_url":"https://arxiv.org/pdf/2402.17131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.08658v2","updated":"2024-08-26T23:24:52Z","published":"2022-02-17T13:43:06Z","title":"The merged-staircase property: a necessary and nearly sufficient\n condition for SGD learning of sparse functions on two-layer neural networks","summary":" It is currently known how to characterize functions that neural networks can\nlearn with SGD for two extremal parameterizations: neural networks in the\nlinear regime, and neural networks with no structural constraints. However, for\nthe main parametrization of interest (non-linear but regular networks) no tight\ncharacterization has yet been achieved, despite significant developments.\n We take a step in this direction by considering depth-2 neural networks\ntrained by SGD in the mean-field regime. We consider functions on binary inputs\nthat depend on a latent low-dimensional subspace (i.e., small number of\ncoordinates). 
This regime is of interest since it is poorly understood how\nneural networks routinely tackle high-dimensional datasets and adapt to latent\nlow-dimensional structure without suffering from the curse of dimensionality.\nAccordingly, we study SGD-learnability with $O(d)$ sample complexity in a large\nambient dimension $d$.\n Our main results characterize a hierarchical property, the \"merged-staircase\nproperty\", that is both necessary and nearly sufficient for learning in this\nsetting.\n We further show that non-linear training is necessary: for this class of\nfunctions, linear methods on any feature map (e.g., the NTK) are not capable of\nlearning efficiently. The key tools are a new \"dimension-free\" dynamics\napproximation result that applies to functions defined on a latent space of\nlow-dimension, a proof of global convergence based on polynomial identity\ntesting, and an improvement of lower bounds against linear methods for\nnon-almost orthogonal functions.\n","authors":["Emmanuel Abbe","Enric Boix-Adsera","Theodor Misiakiewicz"],"pdf_url":"https://arxiv.org/pdf/2202.08658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14687v1","updated":"2024-08-26T23:24:31Z","published":"2024-08-26T23:24:31Z","title":"A Synthetic Benchmark to Explore Limitations of Localized Drift\n Detections","summary":" Concept drift is a common phenomenon in data streams where the statistical\nproperties of the target variable change over time. Traditionally, drift is\nassumed to occur globally, affecting the entire dataset uniformly. However,\nthis assumption does not always hold true in real-world scenarios where only\nspecific subpopulations within the data may experience drift. This paper\nexplores the concept of localized drift and evaluates the performance of\nseveral drift detection techniques in identifying such localized changes. We\nintroduce a synthetic dataset based on the Agrawal generator, where drift is\ninduced in a randomly chosen subgroup. Our experiments demonstrate that\ncommonly adopted drift detection methods may fail to detect drift when it is\nconfined to a small subpopulation. We propose and test various drift detection\napproaches to quantify their effectiveness in this localized drift scenario. We\nmake the source code for the generation of the synthetic benchmark available at\nhttps://github.com/fgiobergia/subgroup-agrawal-drift.\n","authors":["Flavio Giobergia","Eliana Pastor","Luca de Alfaro","Elena Baralis"],"pdf_url":"https://arxiv.org/pdf/2408.14687v1.pdf","comment":"Paper accepted at DELTA Workshop @ KDD 2024"},{"id":"http://arxiv.org/abs/2408.14685v1","updated":"2024-08-26T23:21:44Z","published":"2024-08-26T23:21:44Z","title":"Model-Based Reinforcement Learning for Control of Strongly-Disturbed\n Unsteady Aerodynamic Flows","summary":" The intrinsic high dimension of fluid dynamics is an inherent challenge to\ncontrol of aerodynamic flows, and this is further complicated by a flow's\nnonlinear response to strong disturbances. Deep reinforcement learning, which\ntakes advantage of the exploratory aspects of reinforcement learning (RL) and\nthe rich nonlinearity of a deep neural network, provides a promising approach\nto discover feasible control strategies. However, the typical model-free\napproach to reinforcement learning requires a significant amount of interaction\nbetween the flow environment and the RL agent during training, and this high\ntraining cost impedes its development and application. 
In this work, we propose\na model-based reinforcement learning (MBRL) approach by incorporating a novel\nreduced-order model as a surrogate for the full environment. The model consists\nof a physics-augmented autoencoder, which compresses high-dimensional CFD flow\nfield snaphsots into a three-dimensional latent space, and a latent dynamics\nmodel that is trained to accurately predict the long-time dynamics of\ntrajectories in the latent space in response to action sequences. The\nrobustness and generalizability of the model is demonstrated in two distinct\nflow environments, a pitching airfoil in a highly disturbed environment and a\nvertical-axis wind turbine in a disturbance-free environment. Based on the\ntrained model in the first problem, we realize an MBRL strategy to mitigate\nlift variation during gust-airfoil encounters. We demonstrate that the policy\nlearned in the reduced-order environment translates to an effective control\nstrategy in the full CFD environment.\n","authors":["Zhecheng Liu","Diederik Beckers","Jeff D. Eldredge"],"pdf_url":"https://arxiv.org/pdf/2408.14685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14682v1","updated":"2024-08-26T23:13:38Z","published":"2024-08-26T23:13:38Z","title":"Detecting Interpretable Subgroup Drifts","summary":" The ability to detect and adapt to changes in data distributions is crucial\nto maintain the accuracy and reliability of machine learning models. Detection\nis generally approached by observing the drift of model performance from a\nglobal point of view. However, drifts occurring in (fine-grained) data\nsubgroups may go unnoticed when monitoring global drift. We take a different\nperspective, and introduce methods for observing drift at the finer granularity\nof subgroups. Relevant data subgroups are identified during training and\nmonitored efficiently throughout the model's life. Performance drifts in any\nsubgroup are detected, quantified and characterized so as to provide an\ninterpretable summary of the model behavior over time. Experimental results\nconfirm that our subgroup-level drift analysis identifies drifts that do not\nshow at the (coarser) global dataset level. The proposed approach provides a\nvaluable tool for monitoring model performance in dynamic real-world\napplications, offering insights into the evolving nature of data and ultimately\ncontributing to more robust and adaptive models.\n","authors":["Flavio Giobergia","Eliana Pastor","Luca de Alfaro","Elena Baralis"],"pdf_url":"https://arxiv.org/pdf/2408.14682v1.pdf","comment":"Currently under submission"},{"id":"http://arxiv.org/abs/2401.10393v3","updated":"2024-08-26T23:10:59Z","published":"2024-01-18T22:06:38Z","title":"Natural Mitigation of Catastrophic Interference: Continual Learning in\n Power-Law Learning Environments","summary":" Neural networks often suffer from catastrophic interference (CI): performance\non previously learned tasks drops off significantly when learning a new task.\nThis contrasts strongly with humans, who can continually learn new tasks\nwithout appreciably forgetting previous tasks. Prior work has explored various\ntechniques for mitigating CI and promoting continual learning such as\nregularization, rehearsal, generative replay, and context-specific components.\nThis paper takes a different approach, one guided by cognitive science research\nshowing that in naturalistic environments, the probability of encountering a\ntask decreases as a power-law of the time since it was last performed. 
We argue\nthat techniques for mitigating CI should be compared against the intrinsic\nmitigation in simulated naturalistic learning environments. Thus, we evaluate\nthe extent of the natural mitigation of CI when training models in power-law\nenvironments, similar to those humans face. Our results show that natural\nrehearsal environments are better at mitigating CI than existing methods,\ncalling for the need for better evaluation processes. The benefits of this\nenvironment include simplicity, rehearsal that is agnostic to both tasks and\nmodels, and the lack of a need for extra neural circuitry. In addition, we\nexplore popular mitigation techniques in power-law environments to create new\nbaselines for continual learning research.\n","authors":["Atith Gandhi","Raj Sanjay Shah","Vijay Marupudi","Sashank Varma"],"pdf_url":"https://arxiv.org/pdf/2401.10393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14681v1","updated":"2024-08-26T23:10:42Z","published":"2024-08-26T23:10:42Z","title":"Enhancing Neural Network Interpretability Through Conductance-Based\n Information Plane Analysis","summary":" The Information Plane is a conceptual framework used to analyze the flow of\ninformation in neural networks, but traditional methods based on activations\nmay not fully capture the dynamics of information processing. This paper\nintroduces a new approach that uses layer conductance, a measure of sensitivity\nto input features, to enhance the Information Plane analysis. By incorporating\ngradient-based contributions, we provide a more precise characterization of\ninformation dynamics within the network. The proposed conductance-based\nInformation Plane and a new Information Transformation Efficiency (ITE) metric\nare evaluated on pretrained ResNet50 and VGG16 models using the ImageNet\ndataset. Our results demonstrate the ability to identify critical hidden layers\nthat contribute significantly to model performance and interpretability, giving\ninsights into information compression, preservation, and utilization across\nlayers. The conductance-based approach offers a granular perspective on feature\nattribution, enhancing our understanding of the decision-making processes\nwithin neural networks. Furthermore, our empirical findings challenge certain\ntheoretical predictions of the Information Bottleneck theory, highlighting the\ncomplexities of information dynamics in real-world data scenarios. The proposed\nmethod not only advances our understanding of information dynamics in neural\nnetworks but also has the potential to significantly impact the broader field\nof Artificial Intelligence by enabling the development of more interpretable,\nefficient, and robust models.\n","authors":["Jaouad Dabounou","Amine Baazzouz"],"pdf_url":"https://arxiv.org/pdf/2408.14681v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.14680v1","updated":"2024-08-26T23:10:01Z","published":"2024-08-26T23:10:01Z","title":"On-Chip Learning with Memristor-Based Neural Networks: Assessing\n Accuracy and Efficiency Under Device Variations, Conductance Errors, and\n Input Noise","summary":" This paper presents a memristor-based compute-in-memory hardware accelerator\nfor on-chip training and inference, focusing on its accuracy and efficiency\nagainst device variations, conductance errors, and input noise. 
Utilizing\nrealistic SPICE models of commercially available silver-based metal\nself-directed channel (M-SDC) memristors, the study incorporates inherent\ndevice non-idealities into the circuit simulations. The hardware, consisting of\n30 memristors and 4 neurons, utilizes three different M-SDC structures with\ntungsten, chromium, and carbon media to perform binary image classification\ntasks. An on-chip training algorithm precisely tunes memristor conductance to\nachieve target weights. Results show that incorporating moderate noise (<15%)\nduring training enhances robustness to device variations and noisy input data,\nachieving up to 97% accuracy despite conductance variations and input noises.\nThe network tolerates a 10% conductance error without significant accuracy\nloss. Notably, omitting the initial memristor reset pulse during training\nconsiderably reduces training time and energy consumption. The hardware\ndesigned with chromium-based memristors exhibits superior performance,\nachieving a training time of 2.4 seconds and an energy consumption of 18.9 mJ.\nThis research provides insights for developing robust and energy-efficient\nmemristor-based neural networks for on-chip learning in edge applications.\n","authors":["M. Reza Eslami","Dhiman Biswas","Soheib Takhtardeshir","Sarah S. Sharif","Yaser M. Banad"],"pdf_url":"https://arxiv.org/pdf/2408.14680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12320v2","updated":"2024-08-26T23:05:51Z","published":"2024-08-22T11:57:07Z","title":"PolyRouter: A Multi-LLM Querying System","summary":" With the rapid growth of Large Language Models (LLMs) across various domains,\nnumerous new LLMs have emerged, each possessing domain-specific expertise. This\nproliferation has highlighted the need for quick, high-quality, and\ncost-effective LLM query response methods. Yet, no single LLM exists to\nefficiently balance this trilemma. Some models are powerful but extremely\ncostly, while others are fast and inexpensive but qualitatively inferior. To\naddress this challenge, we present PolyRouter, a non-monolithic LLM querying\nsystem that seamlessly integrates various LLM experts into a single query\ninterface and dynamically routes incoming queries to the most high-performant\nexpert based on query's requirements. Through extensive experiments, we\ndemonstrate that when compared to standalone expert models, PolyRouter improves\nquery efficiency by up to 40%, and leads to significant cost reductions of up\nto 30%, while maintaining or enhancing model performance by up to 10%.\n","authors":["Dimitris Stripelis","Zijian Hu","Jipeng Zhang","Zhaozhuo Xu","Alay Dilipbhai Shah","Han Jin","Yuhang Yao","Salman Avestimehr","Chaoyang He"],"pdf_url":"https://arxiv.org/pdf/2408.12320v2.pdf","comment":"14 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.14678v1","updated":"2024-08-26T23:01:48Z","published":"2024-08-26T23:01:48Z","title":"Bridging the Gap: Unpacking the Hidden Challenges in Knowledge\n Distillation for Online Ranking Systems","summary":" Knowledge Distillation (KD) is a powerful approach for compressing a large\nmodel into a smaller, more efficient model, particularly beneficial for\nlatency-sensitive applications like recommender systems. However, current KD\nresearch predominantly focuses on Computer Vision (CV) and NLP tasks,\noverlooking unique data characteristics and challenges inherent to recommender\nsystems. 
This paper addresses these overlooked challenges, specifically: (1)\nmitigating data distribution shifts between teacher and student models, (2)\nefficiently identifying optimal teacher configurations within time and\nbudgetary constraints, and (3) enabling computationally efficient and rapid\nsharing of teacher labels to support multiple students. We present a robust KD\nsystem developed and rigorously evaluated on multiple large-scale personalized\nvideo recommendation systems within Google. Our live experiment results\ndemonstrate significant improvements in student model performance while\nensuring consistent and reliable generation of high quality teacher labels from\na continuous data stream of data.\n","authors":["Nikhil Khani","Shuo Yang","Aniruddh Nath","Yang Liu","Pendo Abbo","Li Wei","Shawn Andrews","Maciej Kula","Jarrod Kahn","Zhe Zhao","Lichan Hong","Ed Chi"],"pdf_url":"https://arxiv.org/pdf/2408.14678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14677v1","updated":"2024-08-26T22:57:01Z","published":"2024-08-26T22:57:01Z","title":"Can Optimization Trajectories Explain Multi-Task Transfer?","summary":" Despite the widespread adoption of multi-task training in deep learning,\nlittle is understood about how multi-task learning (MTL) affects\ngeneralization. Prior work has conjectured that the negative effects of MTL are\ndue to optimization challenges that arise during training, and many\noptimization methods have been proposed to improve multi-task performance.\nHowever, recent work has shown that these methods fail to consistently improve\nmulti-task generalization. In this work, we seek to improve our understanding\nof these failures by empirically studying how MTL impacts the optimization of\ntasks, and whether this impact can explain the effects of MTL on\ngeneralization. We show that MTL results in a generalization gap-a gap in\ngeneralization at comparable training loss-between single-task and multi-task\ntrajectories early into training. However, we find that factors of the\noptimization trajectory previously proposed to explain generalization gaps in\nsingle-task settings cannot explain the generalization gaps between single-task\nand multi-task models. Moreover, we show that the amount of gradient conflict\nbetween tasks is correlated with negative effects to task optimization, but is\nnot predictive of generalization. Our work sheds light on the underlying causes\nfor failures in MTL and, importantly, raises questions about the role of\ngeneral purpose multi-task optimization algorithms.\n","authors":["David Mueller","Mark Dredze","Nicholas Andrews"],"pdf_url":"https://arxiv.org/pdf/2408.14677v1.pdf","comment":"Pre-print"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.14155v1","updated":"2024-08-26T09:59:45Z","published":"2024-08-26T09:59:45Z","title":"Digital Fingerprinting on Multimedia: A Survey","summary":" The explosive growth of multimedia content in the digital economy era has\nbrought challenges in content recognition, copyright protection, and data\nmanagement. As an emerging content management technology, perceptual hash-based\ndigital fingerprints, serving as compact summaries of multimedia content, have\nbeen widely adopted for efficient multimedia content identification and\nretrieval across different modalities (e.g., text, image, video, audio),\nattracting significant attention from both academia and industry. 
Despite the\nincreasing applications of digital fingerprints, there is a lack of systematic\nand comprehensive literature review on multimedia digital fingerprints. This\nsurvey aims to fill this gap and provide an important resource for researchers\nstudying the details and related advancements of multimedia digital\nfingerprints. The survey first introduces the definition, characteristics, and\nrelated concepts (including hash functions, granularity, similarity measures,\netc.) of digital fingerprints. It then focuses on analyzing and summarizing the\nalgorithms for extracting unimodal fingerprints of different types of digital\ncontent, including text fingerprints, image fingerprints, video fingerprints,\nand audio fingerprints. Particularly, it provides an in-depth review and\nsummary of deep learning-based fingerprints. Additionally, the survey\nelaborates on the various practical applications of digital fingerprints and\noutlines the challenges and potential future research directions. The goal is\nto promote the continued development of multimedia digital fingerprint\nresearch.\n","authors":["Wendi Chen","Wensheng Gan","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2408.14155v1.pdf","comment":"Preprint. 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2407.04284v2","updated":"2024-08-26T08:19:03Z","published":"2024-07-05T06:32:52Z","title":"TSC-PCAC: Voxel Transformer and Sparse Convolution Based Point Cloud\n Attribute Compression for 3D Broadcasting","summary":" Point cloud has been the mainstream representation for advanced 3D\napplications, such as virtual reality and augmented reality. However, the\nmassive data amounts of point clouds is one of the most challenging issues for\ntransmission and storage. In this paper, we propose an end-to-end voxel\nTransformer and Sparse Convolution based Point Cloud Attribute Compression\n(TSC-PCAC) for 3D broadcasting. Firstly, we present a framework of the\nTSC-PCAC, which include Transformer and Sparse Convolutional Module (TSCM)\nbased variational autoencoder and channel context module. Secondly, we propose\na two-stage TSCM, where the first stage focuses on modeling local dependencies\nand feature representations of the point clouds, and the second stage captures\nglobal features through spatial and channel pooling encompassing larger\nreceptive fields. This module effectively extracts global and local interpoint\nrelevance to reduce informational redundancy. Thirdly, we design a TSCM based\nchannel context module to exploit interchannel correlations, which improves the\npredicted probability distribution of quantized latent representations and thus\nreduces the bitrate. Experimental results indicate that the proposed TSC-PCAC\nmethod achieves an average of 38.53%, 21.30%, and 11.19% Bjontegaard Delta\nbitrate reductions compared to the Sparse-PCAC, NF-PCAC, and G-PCC v23 methods,\nrespectively. The encoding/decoding time costs are reduced up to 97.68%/98.78%\non average compared to the Sparse-PCAC. 
The source code and the trained models\nof the TSC-PCAC are available at https://github.com/igizuxo/TSC-PCAC.\n","authors":["Zixi Guo","Yun Zhang","Linwei Zhu","Hanli Wang","Gangyi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.04284v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14084v1","updated":"2024-08-26T08:11:35Z","published":"2024-08-26T08:11:35Z","title":"HABD: a houma alliance book ancient handwritten character recognition\n database","summary":" The Houma Alliance Book, one of history's earliest calligraphic examples, was\nunearthed in the 1970s. These artifacts were meticulously organized,\nreproduced, and copied by the Shanxi Provincial Institute of Cultural Relics.\nHowever, because of their ancient origins and severe ink erosion, identifying\ncharacters in the Houma Alliance Book is challenging, necessitating the use of\ndigital technology. In this paper, we propose a new ancient handwritten\ncharacter recognition database for the Houma alliance book, along with a novel\nbenchmark based on deep learning architectures. More specifically, a collection\nof 26,732 characters samples from the Houma Alliance Book were gathered,\nencompassing 327 different types of ancient characters through iterative\nannotation. Furthermore, benchmark algorithms were proposed by combining four\ndeep neural network classifiers with two data augmentation methods. This\nresearch provides valuable resources and technical support for further studies\non the Houma Alliance Book and other ancient characters. This contributes to\nour understanding of ancient culture and history, as well as the preservation\nand inheritance of humanity's cultural heritage.\n","authors":["Xiaoyu Yuan","Xiaohua Huang","Zibo Zhang","Yabo Sun"],"pdf_url":"https://arxiv.org/pdf/2408.14084v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.12321v2","updated":"2024-08-26T04:27:54Z","published":"2024-08-22T11:57:16Z","title":"MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework\n for Multimodal Large Language Model","summary":" This paper presents MaVEn, an innovative Multi-granularity Visual Encoding\nframework designed to enhance the capabilities of Multimodal Large Language\nModels (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on\nsingle-image visual understanding, limiting their ability to interpret and\nintegrate information across multiple images. MaVEn addresses this limitation\nby combining discrete visual symbol sequences, which abstract coarse-grained\nsemantic concepts, with traditional continuous representation sequences that\nmodel fine-grained features. This dual approach bridges the semantic gap\nbetween visual and textual data, thereby improving the model's ability to\nprocess and interpret information from multiple images effectively.\nAdditionally, we design a dynamic reduction mechanism by for long-sequence\ncontinuous features to enhance multi-image processing efficiency. 
Experimental\nresults demonstrate that MaVEn significantly enhances MLLMs' understanding in\ncomplex multi-image scenarios, while also improving performance in single-image\ncontexts.\n","authors":["Chaoya Jiang","Jia Hongrui","Haiyang Xu","Wei Ye","Mengfan Dong","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14547v1","updated":"2024-08-26T18:00:33Z","published":"2024-08-26T18:00:33Z","title":"Revisiting Image Captioning Training Paradigm via Direct CLIP-based\n Optimization","summary":" The conventional training approach for image captioning involves pre-training\na network using teacher forcing and subsequent fine-tuning with Self-Critical\nSequence Training to maximize hand-crafted captioning metrics. However, when\nattempting to optimize modern and higher-quality metrics like CLIP-Score and\nPAC-Score, this training method often encounters instability and fails to\nacquire the genuine descriptive capabilities needed to produce fluent and\ninformative captions. In this paper, we propose a new training paradigm termed\nDirect CLIP-Based Optimization (DiCO). Our approach jointly learns and\noptimizes a reward model that is distilled from a learnable captioning\nevaluator with high human correlation. This is done by solving a weighted\nclassification problem directly inside the captioner. At the same time, DiCO\nprevents divergence from the original model, ensuring that fluency is\nmaintained. DiCO not only exhibits improved stability and enhanced quality in\nthe generated captions but also aligns more closely with human preferences\ncompared to existing methods, especially in modern metrics. Additionally, it\nmaintains competitive performance in traditional metrics. Our source code and\ntrained models are publicly available at https://github.com/aimagelab/DiCO.\n","authors":["Nicholas Moratelli","Davide Caffagni","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2408.14547v1.pdf","comment":"BMVC 2024"}]},"2024-08-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.12574v2","updated":"2024-08-25T23:58:25Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. 
We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Leyla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v2.pdf","comment":"Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2408.13959v1","updated":"2024-08-25T23:46:35Z","published":"2024-08-25T23:46:35Z","title":"Bidirectional Awareness Induction in Autoregressive Seq2Seq Models","summary":" Autoregressive Sequence-To-Sequence models are the foundation of many Deep\nLearning achievements in major research fields such as Vision and Natural\nLanguage Processing. Despite that, they still present significant limitations.\nFor instance, when errors occur in the early steps of the prediction, the whole\noutput is severely affected. Such reliance on previously predicted tokens and\nthe inherent computational unfriendliness of sequential algorithms, motivated\nresearchers to explore different architectures and methods in the search for\nbidirectional approaches. In this work, we introduce the Bidirectional\nAwareness Induction (BAI), a training method that leverages a subset of\nelements in the network, the Pivots, to perform bidirectional learning without\nbreaking the autoregressive constraints. To showcase its flexibility, we apply\nthe method to three architectures, the Transformer, ExpansionNet v2 and GPT,\nthen perform experiments over three tasks. Experimental results showcase BAI's\neffectiveness on all selected tasks and architectures. In particular, we\nobserved an increase of up to 2.4 CIDEr in Image-Captioning, 4.96 BLEU in\nNeural Machine Translation, and 1.16 ROUGE in Text Summarization compared to\nthe respective baselines. Notably, BAI not only has a positive impact on models\ntrained from scratch but on pre-trained models as well. Such an aspect,\ncombined with the absence of architectural requirements synergizes well with\nthe current trend of LLMs.\n","authors":["Jia Cheng Hu","Roberto Cavicchioli","Alessandro Capotondi"],"pdf_url":"https://arxiv.org/pdf/2408.13959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13958v1","updated":"2024-08-25T23:41:39Z","published":"2024-08-25T23:41:39Z","title":"Prediction of COPD Using Machine Learning, Clinical Summary Notes, and\n Vital Signs","summary":" Chronic obstructive pulmonary disease (COPD) is a chronic inflammatory lung\ndisease that causes obstructed airflow from the lungs. In the United States,\nmore than 15.7 million Americans have been diagnosed with COPD, with 96% of\nindividuals living with at least one other chronic health condition. It is the\n4th leading cause of death in the country. Over 2.2 million patients are\nadmitted to hospitals annually due to COPD exacerbations. Monitoring and\npredicting patient exacerbations on-time could save their life. This paper\npresents two different predictive models to predict COPD exacerbation using AI\nand natural language processing (NLP) approaches. These models use respiration\nsummary notes, symptoms, and vital signs. 
To train and test these models, data\nrecords containing physiologic signals and vital signs time series were used.\nThese records were captured from patient monitors and comprehensive clinical\ndata obtained from hospital medical information systems for tens of thousands\nof Intensive Care Unit (ICU) patients. We achieved an area under the Receiver\noperating characteristic (ROC) curve of 0.82 in detection and prediction of\nCOPD exacerbation.\n","authors":["Negar Orangi-Fard"],"pdf_url":"https://arxiv.org/pdf/2408.13958v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.06878v2","updated":"2024-08-25T23:25:08Z","published":"2024-06-11T01:43:23Z","title":"Modeling language contact with the Iterated Learning Model","summary":" Contact between languages has the potential to transmit vocabulary and other\nlanguage features; however, this does not always happen. Here, an iterated\nlearning model is used to examine, in a simple way, the resistance of languages\nto change during language contact. Iterated learning models are agent-based\nmodels of language change, they demonstrate that languages that are expressive\nand compositional arise spontaneously as a consequence of a language\ntransmission bottleneck. A recently introduced type of iterated learning model,\nthe Semi-Supervised ILM is used to simulate language contact. These simulations\ndo not include many of the complex factors involved in language contact and do\nnot model a population of speakers; nonetheless the model demonstrates that the\ndynamics which lead languages in the model to spontaneously become expressive\nand compositional, also cause a language to maintain its core traits even after\nmixing with another language.\n","authors":["Seth Bullock","Conor Houghton"],"pdf_url":"https://arxiv.org/pdf/2406.06878v2.pdf","comment":"to appear ALIFE24"},{"id":"http://arxiv.org/abs/2408.13940v1","updated":"2024-08-25T21:20:17Z","published":"2024-08-25T21:20:17Z","title":"CoT Rerailer: Enhancing the Reliability of Large Language Models in\n Complex Reasoning Tasks through Error Detection and Correction","summary":" Chain-of-Thought (CoT) prompting enhances Large Language Models (LLMs)\ncomplex reasoning abilities by generating intermediate steps. However, these\nsteps can introduce hallucinations and accumulate errors. We propose the CoT\nRerailer to address these challenges, employing self-consistency and\nmulti-agent debate systems to identify and rectify errors in the reasoning\nprocess. The CoT Rerailer first selects the most logically correct Reasoning\nPath (RP) using consistency checks and critical evaluation by automated agents.\nIt then engages a multi-agent debate system to propose and validate corrections\nto ensure the generation of an error-free intermediate logical path. The\ncorrected steps are then used to generate a revised reasoning chain to further\nreduce hallucinations and enhance answer quality. We demonstrate the\neffectiveness of our approach across diverse question-answering datasets in\nvarious knowledge domains. 
The CoT Rerailer enhances the reliability of\nLLM-generated reasoning, contributing to more trustworthy AI driven\ndecision-making processes.\n","authors":["Guangya Wan","Yuqi Wu","Jie Chen","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2408.13940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13933v1","updated":"2024-08-25T20:41:22Z","published":"2024-08-25T20:41:22Z","title":"MobileQuant: Mobile-friendly Quantization for On-device Language Models","summary":" Large language models (LLMs) have revolutionized language processing,\ndelivering outstanding results across multiple applications. However, deploying\nLLMs on edge devices poses several challenges with respect to memory, energy,\nand compute costs, limiting their widespread use in devices such as mobile\nphones. A promising solution is to reduce the number of bits used to represent\nweights and activations. While existing works have found partial success at\nquantizing LLMs to lower bitwidths, e.g. 4-bit weights, quantizing activations\nbeyond 16 bits often leads to large computational overheads due to poor\non-device quantization support, or a considerable accuracy drop. Yet, 8-bit\nactivations are very attractive for on-device deployment as they would enable\nLLMs to fully exploit mobile-friendly hardware, e.g. Neural Processing Units\n(NPUs). In this work, we make a first attempt to facilitate the on-device\ndeployment of LLMs using integer-only quantization. We first investigate the\nlimitations of existing quantization methods for on-device deployment, with a\nspecial focus on activation quantization. We then address these limitations by\nintroducing a simple post-training quantization method, named MobileQuant, that\nextends previous weight equivalent transformation works by jointly optimizing\nthe weight transformation and activation range parameters in an end-to-end\nmanner. MobileQuant demonstrates superior capabilities over existing methods by\n1) achieving near-lossless quantization on a wide range of LLM benchmarks, 2)\nreducing latency and energy consumption by 20\\%-50\\% compared to current\non-device quantization strategies, 3) requiring limited compute budget, 4)\nbeing compatible with mobile-friendly compute units, e.g. NPU.\n","authors":["Fuwen Tan","Royson Lee","Łukasz Dudziak","Shell Xu Hu","Sourav Bhattacharya","Timothy Hospedales","Georgios Tzimiropoulos","Brais Martinez"],"pdf_url":"https://arxiv.org/pdf/2408.13933v1.pdf","comment":"Code and models available: https://github.com/saic-fi/MobileQuant"},{"id":"http://arxiv.org/abs/2408.13915v1","updated":"2024-08-25T18:47:55Z","published":"2024-08-25T18:47:55Z","title":"LLMs are Superior Feedback Providers: Bootstrapping Reasoning for Lie\n Detection with Self-Generated Feedback","summary":" Large Language Models (LLMs) excel at generating human-like dialogues and\ncomprehending text. However, understanding the subtleties of complex exchanges\nin language remains a challenge. We propose a bootstrapping framework that\nleverages self-generated feedback to enhance LLM reasoning capabilities for lie\ndetection. The framework consists of three stages: suggestion, feedback\ncollection, and modification. In the suggestion stage, a cost-effective\nlanguage model generates initial predictions based on game state and dialogue.\nThe feedback-collection stage involves a language model providing feedback on\nthese predictions. In the modification stage, a more advanced language model\nrefines the initial predictions using the auto-generated feedback. 
We\ninvestigate the application of the proposed framework for detecting betrayal\nand deception in Diplomacy games, and compare it with feedback from\nprofessional human players. The LLM-generated feedback exhibits superior\nquality and significantly enhances the performance of the model. Our approach\nachieves a 39% improvement over the zero-shot baseline in lying-F1 without the\nneed for any training data, rivaling state-of-the-art supervised learning\nresults.\n","authors":["Tanushree Banerjee","Richard Zhu","Runzhe Yang","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2408.13915v1.pdf","comment":"19 pages, 18 figures"},{"id":"http://arxiv.org/abs/2408.13909v1","updated":"2024-08-25T18:10:16Z","published":"2024-08-25T18:10:16Z","title":"LowCLIP: Adapting the CLIP Model Architecture for Low-Resource Languages\n in Multimodal Image Retrieval Task","summary":" This research explores the development of multimodal vision-language models\nfor image retrieval in low-resource languages, specifically Azerbaijani.\nExisting vision-language models primarily support high-resource languages, and\nfine-tuning them remains computationally demanding. To address challenges in\nvision-language retrieval for low-resource languages, we integrated the CLIP\nmodel architecture and employed several techniques to balance computational\nefficiency with performance. These techniques include synthetic data generation\nthrough machine translation, image augmentation, and further training the\nattention mechanisms of transformer-based models with domain-specific data. We\nintegrated Multilingual BERT as a text encoder with image encoders like\nResNet50, EfficientNet0, Vision Transformer (ViT), and Tiny Swin Transformer.\nOur study found that models like EfficientNet0 and Tiny Swin Transformer\nperform best on the datasets they were trained on, such as COCO, Flickr30k, and\nFlickr8k. Augmentation techniques boosted EfficientNet0 MAP on Flickr30k from\n0.84 to 0.87 and ResNet50 MAP on MSCOCO from 0.70 to 0.80, contributing to a\nnew state of the art in vision-language retrieval. We share our configurations\nand results to support further research. Code and pre-trained models are\navailable at https://github.com/aliasgerovs/azclip.\n","authors":["Ali Asgarov","Samir Rustamov"],"pdf_url":"https://arxiv.org/pdf/2408.13909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05109v2","updated":"2024-08-25T17:22:29Z","published":"2024-05-08T15:05:55Z","title":"QFMTS: Generating Query-Focused Summaries over Multi-Table Inputs","summary":" Table summarization is a crucial task aimed at condensing information from\ntabular data into concise and comprehensible textual summaries. However,\nexisting approaches often fall short of adequately meeting users' information\nand quality requirements and tend to overlook the complexities of real-world\nqueries. In this paper, we propose a novel method to address these limitations\nby introducing query-focused multi-table summarization. Our approach, which\ncomprises a table serialization module, a summarization controller, and a large\nlanguage model (LLM), utilizes textual queries and multiple tables to generate\nquery-dependent table summaries tailored to users' information needs. To\nfacilitate research in this area, we present a comprehensive dataset\nspecifically tailored for this task, consisting of 4909 query-summary pairs,\neach associated with multiple tables. 
Through extensive experiments using our\ncurated dataset, we demonstrate the effectiveness of our proposed method\ncompared to baseline approaches. Our findings offer insights into the\nchallenges of complex table reasoning for precise summarization, contributing\nto the advancement of research in query-focused multi-table summarization.\n","authors":["Weijia Zhang","Vaishali Pal","Jia-Hong Huang","Evangelos Kanoulas","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2405.05109v2.pdf","comment":"Accepted by the 27th European Conference on Artificial Intelligence\n (ECAI-2024)"},{"id":"http://arxiv.org/abs/2408.13891v1","updated":"2024-08-25T17:05:26Z","published":"2024-08-25T17:05:26Z","title":"SpeechCaps: Advancing Instruction-Based Universal Speech Models with\n Multi-Talker Speaking Style Captioning","summary":" Instruction-based speech processing is becoming popular. Studies show that\ntraining with multiple tasks boosts performance, but collecting diverse,\nlarge-scale tasks and datasets is expensive. Thus, it is highly desirable to\ndesign a fundamental task that benefits other downstream tasks. This paper\nintroduces a multi-talker speaking style captioning task to enhance the\nunderstanding of speaker and prosodic information. We used large language\nmodels to generate descriptions for multi-talker speech. Then, we trained our\nmodel with pre-training on this captioning task followed by instruction tuning.\nEvaluation on Dynamic-SUPERB shows our model outperforming the baseline\npre-trained only on single-talker tasks, particularly in speaker and emotion\nrecognition. Additionally, tests on a multi-talker QA task reveal that current\nmodels struggle with attributes such as gender, pitch, and speaking rate. The\ncode and dataset are available at https://github.com/cyhuang-tw/speechcaps.\n","authors":["Chien-yu Huang","Min-Han Shih","Ke-Han Lu","Chi-Yuan Hsiao","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2408.13891v1.pdf","comment":"SynData4GenAI 2024"},{"id":"http://arxiv.org/abs/2408.13889v1","updated":"2024-08-25T16:43:19Z","published":"2024-08-25T16:43:19Z","title":"LLM with Relation Classifier for Document-Level Relation Extraction","summary":" Large language models (LLMs) create a new paradigm for natural language\nprocessing. Despite their advancement, LLM-based methods still lag behind\ntraditional approaches in document-level relation extraction (DocRE), a\ncritical task for understanding complex entity relations. This paper\ninvestigates the causes of this performance gap, identifying the dispersion of\nattention by LLMs due to entity pairs without relations as a primary factor. We\nthen introduce a novel classifier-LLM approach to DocRE. The proposed approach\nbegins with a classifier specifically designed to select entity pair candidates\nexhibiting potential relations and thereby feeds them to LLM for the final\nrelation extraction. This method ensures that during inference, the LLM's focus\nis directed primarily at entity pairs with relations. 
Experiments on DocRE\nbenchmarks reveal that our method significantly outperforms recent LLM-based\nDocRE models and achieves competitive performance with several leading\ntraditional DocRE models.\n","authors":["Xingzuo Li","Kehai Chen","Yunfei Long","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13863v1","updated":"2024-08-25T15:27:21Z","published":"2024-08-25T15:27:21Z","title":"CodeGraph: Enhancing Graph Reasoning of LLMs with Code","summary":" With the increasing popularity of large language models (LLMs), reasoning on\nbasic graph algorithm problems is an essential intermediate step in assessing\ntheir abilities to process and infer complex graph reasoning tasks. Existing\nmethods usually convert graph-structured data to textual descriptions and then\nuse LLMs for reasoning and computation. However, LLMs often produce computation\nerrors on arithmetic parts in basic graph algorithm problems, such as counting\nnumber of edges. In addition, they struggle to control or understand the output\nof the reasoning process, raising concerns about whether LLMs are simply\nguessing. In this paper, we introduce CodeGraph, a method that encodes graph\nproblem solutions as code. The methods solve new graph problems by learning\nfrom exemplars, generating programs, and executing them via a program\ninterpreter. Using the few-shot setting, we evaluate CodeGraph with the base\nLLM being GPT-3.5 Turbo, Llama3-70B Instruct, Mixtral-8x22B Instruct, and\nMixtral-8x7B Instruct. Experimental results on six tasks with six graph\nencoding methods in the GraphQA dataset demonstrate that CodeGraph can boost\nperformance on graph reasoning tasks inside LLMs by 1.3% to 58.6%, depending on\nthe task. Compared to the existing methods, CodeGraph demonstrates strong\nperformance on arithmetic problems in graph tasks and offers a more\ncontrollable and interpretable approach to the reasoning process.\n","authors":["Qiaolong Cai","Zhaowei Wang","Shizhe Diao","James Kwok","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2408.13863v1.pdf","comment":"In Progress"},{"id":"http://arxiv.org/abs/2408.13860v1","updated":"2024-08-25T15:17:43Z","published":"2024-08-25T15:17:43Z","title":"Knowledge-Aware Reasoning over Multimodal Semi-structured Tables","summary":" Existing datasets for tabular question answering typically focus exclusively\non text within cells. However, real-world data is inherently multimodal, often\nblending images such as symbols, faces, icons, patterns, and charts with\ntextual content in tables. With the evolution of AI models capable of\nmultimodal reasoning, it is pertinent to assess their efficacy in handling such\nstructured data. This study investigates whether current AI models can perform\nknowledge-aware reasoning on multimodal structured data. We explore their\nability to reason on tables that integrate both images and text, introducing\nMMTabQA, a new dataset designed for this purpose. Our experiments highlight\nsubstantial challenges for current AI models in effectively integrating and\ninterpreting multiple text and image inputs, understanding visual context, and\ncomparing visual content across images. 
These findings establish our dataset as\na robust benchmark for advancing AI's comprehension and capabilities in\nanalyzing multimodal structured data.\n","authors":["Suyash Vardhan Mathur","Jainit Sushil Bafna","Kunal Kartik","Harshita Khandelwal","Manish Shrivastava","Vivek Gupta","Mohit Bansal","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2408.13860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02267v2","updated":"2024-08-25T14:41:32Z","published":"2024-05-03T17:34:57Z","title":"Structural Pruning of Pre-trained Language Models via Neural\n Architecture Search","summary":" Pre-trained language models (PLM), for example BERT or RoBERTa, mark the\nstate-of-the-art for natural language understanding task when fine-tuned on\nlabeled data. However, their large size poses challenges in deploying them for\ninference in real-world applications, due to significant GPU memory\nrequirements and high inference latency. This paper explores neural\narchitecture search (NAS) for structural pruning to find sub-parts of the\nfine-tuned network that optimally trade-off efficiency, for example in terms of\nmodel size or latency, and generalization performance. We also show how we can\nutilize more recently developed two-stage weight-sharing NAS approaches in this\nsetting to accelerate the search process. Unlike traditional pruning methods\nwith fixed thresholds, we propose to adopt a multi-objective approach that\nidentifies the Pareto optimal set of sub-networks, allowing for a more flexible\nand automated compression process.\n","authors":["Aaron Klein","Jacek Golebiowski","Xingchen Ma","Valerio Perrone","Cedric Archambeau"],"pdf_url":"https://arxiv.org/pdf/2405.02267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02481v3","updated":"2024-08-25T14:21:29Z","published":"2024-06-04T16:49:06Z","title":"Large Language Models as Carriers of Hidden Messages","summary":" With the help of simple fine-tuning, one can artificially embed hidden text\ninto large language models (LLMs). This text is revealed only when triggered by\na specific query to the LLM. Two primary applications are LLM fingerprinting\nand steganography. In the context of LLM fingerprinting, a unique text\nidentifier (fingerprint) is embedded within the model to verify licensing\ncompliance. In the context of steganography, the LLM serves as a carrier for\nhidden messages that can be disclosed through a chosen trigger question.\n Our work demonstrates that embedding hidden text in the LLM via fine-tuning,\nthough seemingly secure due to the vast number of potential triggers (any\nsequence of characters or tokens could serve as a trigger), is susceptible to\nextraction through analysis of the LLM's output decoding process. We propose an\nextraction attack called Unconditional Token Forcing (UTF). It is premised on\nthe hypothesis that iteratively feeding each token from the LLM's vocabulary\ninto the model should reveal output sequences with abnormally high token\nprobabilities, indicating potential hidden text candidates. We also present a\ndefense method to hide text in such a way that it is resistant to both UTF and\nattacks based on sampling decoding methods, which we named Unconditional Token\nForcing Confusion (UTFC). To the best of our knowledge, there is no attack\nmethod that can extract text hidden with UTFC. 
UTFC has both benign\napplications (improving LLM fingerprinting) and malign applications (using LLMs\nto create covert communication channels).\n","authors":["Jakub Hoscilowicz","Pawel Popiolek","Jan Rudkowski","Jedrzej Bieniasz","Artur Janicki"],"pdf_url":"https://arxiv.org/pdf/2406.02481v3.pdf","comment":"Work in progress. Code is available at\n https://github.com/j-hoscilowic/zurek-stegano"},{"id":"http://arxiv.org/abs/2408.13833v1","updated":"2024-08-25T13:36:22Z","published":"2024-08-25T13:36:22Z","title":"Biomedical Large Languages Models Seem not to be Superior to Generalist\n Models on Unseen Medical Data","summary":" Large language models (LLMs) have shown potential in biomedical applications,\nleading to efforts to fine-tune them on domain-specific data. However, the\neffectiveness of this approach remains unclear. This study evaluates the\nperformance of biomedically fine-tuned LLMs against their general-purpose\ncounterparts on a variety of clinical tasks. We evaluated their performance on\nclinical case challenges from the New England Journal of Medicine (NEJM) and\nthe Journal of the American Medical Association (JAMA) and on several clinical\ntasks (e.g., information extraction, document summarization, and clinical\ncoding). Using benchmarks specifically chosen to be likely outside the\nfine-tuning datasets of biomedical models, we found that biomedical LLMs mostly\nperform inferior to their general-purpose counterparts, especially on tasks not\nfocused on medical knowledge. While larger models showed similar performance on\ncase tasks (e.g., OpenBioLLM-70B: 66.4% vs. Llama-3-70B-Instruct: 65% on JAMA\ncases), smaller biomedical models showed more pronounced underperformance\n(e.g., OpenBioLLM-8B: 30% vs. Llama-3-8B-Instruct: 64.3% on NEJM cases).\nSimilar trends were observed across the CLUE (Clinical Language Understanding\nEvaluation) benchmark tasks, with general-purpose models often performing\nbetter on text generation, question answering, and coding tasks. Our results\nsuggest that fine-tuning LLMs to biomedical data may not provide the expected\nbenefits and may potentially lead to reduced performance, challenging\nprevailing assumptions about domain-specific adaptation of LLMs and\nhighlighting the need for more rigorous evaluation frameworks in healthcare AI.\nAlternative approaches, such as retrieval-augmented generation, may be more\neffective in enhancing the biomedical capabilities of LLMs without compromising\ntheir general knowledge.\n","authors":["Felix J. Dorfner","Amin Dada","Felix Busch","Marcus R. Makowski","Tianyu Han","Daniel Truhn","Jens Kleesiek","Madhumita Sushil","Jacqueline Lammert","Lisa C. Adams","Keno K. Bressem"],"pdf_url":"https://arxiv.org/pdf/2408.13833v1.pdf","comment":"10 pages, 3 tables, 1 figure"},{"id":"http://arxiv.org/abs/2408.13831v1","updated":"2024-08-25T13:29:34Z","published":"2024-08-25T13:29:34Z","title":"Guardians of the Machine Translation Meta-Evaluation: Sentinel Metrics\n Fall In!","summary":" Annually, at the Conference of Machine Translation (WMT), the Metrics Shared\nTask organizers conduct the meta-evaluation of Machine Translation (MT)\nmetrics, ranking them according to their correlation with human judgments.\nTheir results guide researchers toward enhancing the next generation of metrics\nand MT systems. With the recent introduction of neural metrics, the field has\nwitnessed notable advancements. Nevertheless, the inherent opacity of these\nmetrics has posed substantial challenges to the meta-evaluation process. 
This\nwork highlights two issues with the meta-evaluation framework currently\nemployed in WMT, and assesses their impact on the metrics rankings. To do this,\nwe introduce the concept of sentinel metrics, which are designed explicitly to\nscrutinize the meta-evaluation process's accuracy, robustness, and fairness. By\nemploying sentinel metrics, we aim to validate our findings, and shed light on\nand monitor the potential biases or inconsistencies in the rankings. We\ndiscover that the present meta-evaluation framework favors two categories of\nmetrics: i) those explicitly trained to mimic human quality assessments, and\nii) continuous metrics. Finally, we raise concerns regarding the evaluation\ncapabilities of state-of-the-art metrics, emphasizing that they might be basing\ntheir assessments on spurious correlations found in their training data.\n","authors":["Stefano Perrella","Lorenzo Proietti","Alessandro Scirè","Edoardo Barba","Roberto Navigli"],"pdf_url":"https://arxiv.org/pdf/2408.13831v1.pdf","comment":"Presented at ACL 2024 Main Conference. 29 pages"},{"id":"http://arxiv.org/abs/2402.13546v2","updated":"2024-08-25T11:23:50Z","published":"2024-02-21T05:56:52Z","title":"LLMs Meet Long Video: Advancing Long Video Question Answering with An\n Interactive Visual Adapter in LLMs","summary":" Long video understanding is a significant and ongoing challenge in the\nintersection of multimedia and artificial intelligence. Employing large\nlanguage models (LLMs) for comprehending video becomes an emerging and\npromising method. However, this approach incurs high computational costs due to\nthe extensive array of video tokens, experiences reduced visual clarity as a\nconsequence of token aggregation, and confronts challenges arising from\nirrelevant visual tokens while answering video-related questions. To alleviate\nthese issues, we present an Interactive Visual Adapter (IVA) within LLMs,\ndesigned to enhance interaction with fine-grained visual elements.\nSpecifically, we first transform long videos into temporal video tokens via\nleveraging a visual encoder alongside a pretrained causal transformer, then\nfeed them into LLMs with the video instructions. Subsequently, we integrated\nIVA, which contains a lightweight temporal frame selector and a spatial feature\ninteractor, within the internal blocks of LLMs to capture instruction-aware and\nfine-grained visual signals. Consequently, the proposed video-LLM facilitates a\ncomprehensive understanding of long video content through appropriate long\nvideo modeling and precise visual interactions. We conducted extensive\nexperiments on nine video understanding benchmarks and experimental results\nshow that our interactive visual adapter significantly improves the performance\nof video LLMs on long video QA tasks. Ablation studies further verify the\neffectiveness of IVA in understanding long and short video.\n","authors":["Yunxin Li","Xinyu Chen","Baotain Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.13546v2.pdf","comment":"12 pages; working in progress"},{"id":"http://arxiv.org/abs/2310.05746v4","updated":"2024-08-25T11:19:33Z","published":"2023-10-09T14:22:09Z","title":"Put Your Money Where Your Mouth Is: Evaluating Strategic Planning and\n Execution of LLM Agents in an Auction Arena","summary":" Recent advancements in Large Language Models (LLMs) showcase advanced\nreasoning, yet NLP evaluations often depend on static benchmarks. 
Evaluating\nthis necessitates environments that test strategic reasoning in dynamic,\ncompetitive scenarios requiring long-term planning. We introduce AucArena, a\nnovel evaluation suite that simulates auctions, a setting chosen for being\nhighly unpredictable and involving many skills related to resource and risk\nmanagement, while also being easy to evaluate. We conduct controlled\nexperiments using state-of-the-art LLMs to power bidding agents to benchmark\ntheir planning and execution skills. Our research demonstrates that LLMs, such\nas GPT-4, possess key skills for auction participation, such as budget\nmanagement and goal adherence, which improve with adaptive strategies. This\nhighlights LLMs' potential in modeling complex social interactions in\ncompetitive contexts. However, variability in LLM performance and occasional\noutperformance by simpler methods indicate opportunities for further\nadvancements in LLM design and the value of our simulation environment for\nongoing testing and refinement.\n","authors":["Jiangjie Chen","Siyu Yuan","Rong Ye","Bodhisattwa Prasad Majumder","Kyle Richardson"],"pdf_url":"https://arxiv.org/pdf/2310.05746v4.pdf","comment":"Project page: https://auction-arena.github.io"},{"id":"http://arxiv.org/abs/2408.13810v1","updated":"2024-08-25T11:13:29Z","published":"2024-08-25T11:13:29Z","title":"Revisiting the Exit from Nuclear Energy in Germany with NLP","summary":" Annotation of political discourse is resource-intensive, but recent\ndevelopments in NLP promise to automate complex annotation tasks. Fine-tuned\ntransformer-based models outperform human annotators in some annotation tasks,\nbut they require large manually annotated training datasets. In our\ncontribution, we explore to which degree a manually annotated dataset can be\nautomatically replicated with today's NLP methods, using unsupervised machine\nlearning and zero- and few-shot learning.\n","authors":["Sebastian Haunss","André Blessing"],"pdf_url":"https://arxiv.org/pdf/2408.13810v1.pdf","comment":"23 pages, 8 figures, Accepted for publication in Zeitschrift f\\\"ur\n Diskursforschung/Journal for Discourse Studies, ISSN: 2195-867X"},{"id":"http://arxiv.org/abs/2408.13808v1","updated":"2024-08-25T11:09:15Z","published":"2024-08-25T11:09:15Z","title":"Towards Reliable Medical Question Answering: Techniques and Challenges\n in Mitigating Hallucinations in Language Models","summary":" The rapid advancement of large language models (LLMs) has significantly\nimpacted various domains, including healthcare and biomedicine. However, the\nphenomenon of hallucination, where LLMs generate outputs that deviate from\nfactual accuracy or context, poses a critical challenge, especially in\nhigh-stakes domains. This paper conducts a scoping study of existing techniques\nfor mitigating hallucinations in knowledge-based task in general and especially\nfor medical domains. Key methods covered in the paper include\nRetrieval-Augmented Generation (RAG)-based techniques, iterative feedback\nloops, supervised fine-tuning, and prompt engineering. These techniques, while\npromising in general contexts, require further adaptation and optimization for\nthe medical domain due to its unique demands for up-to-date, specialized\nknowledge and strict adherence to medical guidelines. 
Addressing these\nchallenges is crucial for developing trustworthy AI systems that enhance\nclinical decision-making and patient safety as well as accuracy of biomedical\nscientific research.\n","authors":["Duy Khoa Pham","Bao Quoc Vo"],"pdf_url":"https://arxiv.org/pdf/2408.13808v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2311.18743v4","updated":"2024-08-25T09:58:57Z","published":"2023-11-30T17:41:30Z","title":"AlignBench: Benchmarking Chinese Alignment of Large Language Models","summary":" Alignment has become a critical step for instruction-tuned Large Language\nModels (LLMs) to become helpful assistants. However, the effective evaluation\nof alignment for emerging Chinese LLMs is still largely unexplored. To fill in\nthis gap, we introduce AlignBench, a comprehensive multi-dimensional benchmark\nfor evaluating LLMs' alignment in Chinese. We design a human-in-the-loop data\ncuration pipeline, containing eight main categories, 683 real-scenario rooted\nqueries and corresponding human verified references. To ensure the correctness\nof references, each knowledge-intensive query is accompanied with evidences\ncollected from reliable web sources (including URLs and quotations) by our\nannotators. For automatic evaluation, our benchmark employs a rule-calibrated\nmulti-dimensional LLM-as-Judge~\\cite{zheng2023judging} approach with\nChain-of-Thought to generate explanations and final ratings, ensuring high\nreliability and interpretability. All evaluation code, data, and LLM\ngenerations are available at \\url{https://github.com/THUDM/AlignBench}. Since\nits release, AlignBench has been adopted by top (Chinese) LLMs for evaluating\ntheir alignment capabilities in Chinese, including ChatGLM, Qwen, DeepSeek, Yi,\nBaichuan, and Abab.\n","authors":["Xiao Liu","Xuanyu Lei","Shengyuan Wang","Yue Huang","Zhuoer Feng","Bosi Wen","Jiale Cheng","Pei Ke","Yifan Xu","Weng Lam Tam","Xiaohan Zhang","Lichao Sun","Xiaotao Gu","Hongning Wang","Jing Zhang","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18743v4.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2407.01411v3","updated":"2024-08-25T09:39:23Z","published":"2024-07-01T16:00:53Z","title":"HyperLoader: Integrating Hypernetwork-Based LoRA and Adapter Layers into\n Multi-Task Transformers for Sequence Labelling","summary":" We present HyperLoader, a simple approach that combines different\nparameter-efficient fine-tuning methods in a multi-task setting. To achieve\nthis goal, our model uses a hypernetwork to generate the weights of these\nmodules based on the task, the transformer layer, and its position within this\nlayer. Our method combines the benefits of multi-task learning by capturing the\nstructure of all tasks while reducing the task interference problem by\nencapsulating the task-specific knowledge in the generated weights and the\nbenefits of combining different parameter-efficient methods to outperform\nfull-fine tuning. 
We provide empirical evidence that HyperLoader outperforms\nprevious approaches in most datasets and obtains the best average performance\nacross tasks in high-resource and low-resource scenarios.\n","authors":["Jesus-German Ortiz-Barajas","Helena Gomez-Adorno","Thamar Solorio"],"pdf_url":"https://arxiv.org/pdf/2407.01411v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13745v1","updated":"2024-08-25T07:10:36Z","published":"2024-08-25T07:10:36Z","title":"DOCE: Finding the Sweet Spot for Execution-Based Code Generation","summary":" Recently, a diverse set of decoding and reranking procedures have been shown\neffective for LLM-based code generation. However, a comprehensive framework\nthat links and experimentally compares these methods is missing. We address\nthis by proposing Decoding Objectives for Code Execution, a comprehensive\nframework that includes candidate generation, $n$-best reranking, minimum Bayes\nrisk (MBR) decoding, and self-debugging as the core components. We then study\nthe contributions of these components through execution-based evaluation\nmetrics. Our findings highlight the importance of execution-based methods and\nthe difference gap between execution-based and execution-free methods.\nFurthermore, we assess the impact of filtering based on trial unit tests, a\nsimple and effective strategy that has been often overlooked in prior works. We\nalso propose self-debugging on multiple candidates, obtaining state-of-the-art\nperformance on reranking for code generation. We expect our framework to\nprovide a solid guideline for future research on code generation.\n","authors":["Haau-Sing Li","Patrick Fernandes","Iryna Gurevych","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2408.13745v1.pdf","comment":"10 pages (32 including appendix), 5 figures, 25 tables. arXiv admin\n note: text overlap with arXiv:2304.05128 by other authors"},{"id":"http://arxiv.org/abs/2408.13739v1","updated":"2024-08-25T06:52:48Z","published":"2024-08-25T06:52:48Z","title":"Literary and Colloquial Tamil Dialect Identification","summary":" Culture and language evolve together. The old literary form of Tamil is used\ncommonly for writing and the contemporary colloquial Tamil is used for\nspeaking. Human-computer interaction applications require Colloquial Tamil (CT)\nto make it more accessible and easy for the everyday user and, it requires\nLiterary Tamil (LT) when information is needed in a formal written format.\nContinuing the use of LT alongside CT in computer aided language learning\napplications will both preserve LT, and provide ease of use via CT, at the same\ntime. Hence there is a need for the conversion between LT and CT dialects,\nwhich demands as a first step, dialect identification. Dialect Identification\n(DID) of LT and CT is an unexplored area of research. In the current work,\nkeeping the nuances of both these dialects in mind, five methods are explored\nwhich include two implicit methods - Gaussian Mixture Model (GMM) and\nConvolutional Neural Network (CNN); two explicit methods - Parallel Phone\nRecognition (PPR) and Parallel Large Vocabulary Continuous Speech Recognition\n(P-LVCSR); two versions of the proposed explicit Unified Phone Recognition\nmethod (UPR-1 and UPR-2). These methods vary based on: the need for annotated\ndata, the size of the unit, the way in which modelling is carried out, and the\nway in which the final decision is made. 
Even though the average duration of\nthe test utterances is less - 4.9s for LT and 2.5s for CT - the systems\nperformed well, offering the following identification accuracies: 87.72% (GMM),\n93.97% (CNN), 89.24% (PPR), 94.21% (P-LVCSR), 88.57% (UPR-1), 93.53% (UPR-1\nwith P-LVCSR), 94.55% (UPR-2), and 95.61% (UPR-2 with P-LVCSR).\n","authors":["M. Nanmalar","P. Vijayalakshmi","T. Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2408.13739v1.pdf","comment":"18 pages, 6 figures, submitted to \"Circuits, Systems, and Signal\n Processing\""},{"id":"http://arxiv.org/abs/2408.13738v1","updated":"2024-08-25T06:49:03Z","published":"2024-08-25T06:49:03Z","title":"Poor-Supervised Evaluation for SuperLLM via Mutual Consistency","summary":" The guidance from capability evaluations has greatly propelled the progress\nof both human society and Artificial Intelligence. However, as LLMs evolve, it\nbecomes challenging to construct evaluation benchmarks for them with accurate\nlabels on hard tasks that approach the boundaries of human capabilities. To\ncredibly conduct evaluation without accurate labels (denoted as poor-supervised\nevaluation), we propose the PoEM framework. We first prove that the capability\nof a model can be equivalently assessed by the consistency between it and\ncertain reference model, when their prediction distributions are independent\nand the sample size is infinite. To alleviate the insufficiencies of the\nconditions in reality, we further introduce an algorithm that treats humans\n(when available) and the models under evaluation as reference models,\nalternately conducting model weights calibration and filtering during E-step\nand M-step. Comprehensive experiments across 3 types of tasks with 16\nmainstream LLMs have shown that PoEM under poor supervision can achieve an\naverage of 0.98 Pearson correlation coefficient with supervised evaluation\nresults, demonstrating good effectiveness, efficiency and generalizability.\nMore generally, PoEM has advanced the evaluation paradigm evolution from\nhuman-centric to human&model-centric by treating both of them as reference\nmodels, mitigating the limitations of human evaluation in the era of LLMs.\n","authors":["Peiwen Yuan","Shaoxiong Feng","Yiwei Li","Xinglin Wang","Boyuan Pan","Heda Wang","Yao Hu","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2408.13738v1.pdf","comment":"ACL findings"},{"id":"http://arxiv.org/abs/2305.04928v5","updated":"2024-08-25T06:22:00Z","published":"2023-05-05T12:14:22Z","title":"From Zero to Hero: Harnessing Transformers for Biomedical Named Entity\n Recognition in Zero- and Few-shot Contexts","summary":" Supervised named entity recognition (NER) in the biomedical domain depends on\nlarge sets of annotated texts with the given named entities. The creation of\nsuch datasets can be time-consuming and expensive, while extraction of new\nentities requires additional annotation tasks and retraining the model. To\naddress these challenges, this paper proposes a method for zero- and few-shot\nNER in the biomedical domain. The method is based on transforming the task of\nmulti-class token classification into binary token classification and\npre-training on a large amount of datasets and biomedical entities, which allow\nthe model to learn semantic relations between the given and potentially novel\nnamed entity labels. 
We have achieved average F1 scores of 35.44% for zero-shot\nNER, 50.10% for one-shot NER, 69.94% for 10-shot NER, and 79.51% for 100-shot\nNER on 9 diverse evaluated biomedical entities with fine-tuned PubMedBERT-based\nmodel. The results demonstrate the effectiveness of the proposed method for\nrecognizing new biomedical entities with no or limited number of examples,\noutperforming previous transformer-based methods, and being comparable to\nGPT3-based models using models with over 1000 times fewer parameters. We make\nmodels and developed code publicly available.\n","authors":["Miloš Košprdić","Nikola Prodanović","Adela Ljajić","Bojana Bašaragin","Nikola Milošević"],"pdf_url":"https://arxiv.org/pdf/2305.04928v5.pdf","comment":"Collaboration between Bayer Pharma R&D and Serbian Institute for\n Artificial Intelligence Research and Development. Artificial Intelligence in\n Medicine (2024)"},{"id":"http://arxiv.org/abs/2408.13704v1","updated":"2024-08-25T02:01:38Z","published":"2024-08-25T02:01:38Z","title":"DHP Benchmark: Are LLMs Good NLG Evaluators?","summary":" Large Language Models (LLMs) are increasingly serving as evaluators in\nNatural Language Generation (NLG) tasks. However, the capabilities of LLMs in\nscoring NLG quality remain inadequately explored. Current studies depend on\nhuman assessments and simple metrics that fail to capture the discernment of\nLLMs across diverse NLG tasks. To address this gap, we propose the Discernment\nof Hierarchical Perturbation (DHP) benchmarking framework, which provides\nquantitative discernment scores for LLMs utilizing hierarchically perturbed\ntext data and statistical tests to measure the NLG evaluation capabilities of\nLLMs systematically. We have re-established six evaluation datasets for this\nbenchmark, covering four NLG tasks: Summarization, Story Completion, Question\nAnswering, and Translation. Our comprehensive benchmarking of five major LLM\nseries provides critical insight into their strengths and limitations as NLG\nevaluators.\n","authors":["Yicheng Wang","Jiayi Yuan","Yu-Neng Chuang","Zhuoer Wang","Yingchi Liu","Mark Cusick","Param Kulkarni","Zhengping Ji","Yasser Ibrahim","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2408.13704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03827v2","updated":"2024-08-25T01:38:45Z","published":"2024-06-06T08:03:05Z","title":"Chaos with Keywords: Exposing Large Language Models Sycophantic\n Hallucination to Misleading Keywords and Evaluating Defense Strategies","summary":" This study explores the sycophantic tendencies of Large Language Models\n(LLMs), where these models tend to provide answers that match what users want\nto hear, even if they are not entirely correct. The motivation behind this\nexploration stems from the common behavior observed in individuals searching\nthe internet for facts with partial or misleading knowledge. Similar to using\nweb search engines, users may recall fragments of misleading keywords and\nsubmit them to an LLM, hoping for a comprehensive response. Our empirical\nanalysis of several LLMs shows the potential danger of these models amplifying\nmisinformation when presented with misleading keywords. Additionally, we\nthoroughly assess four existing hallucination mitigation strategies to reduce\nLLMs sycophantic behavior. Our experiments demonstrate the effectiveness of\nthese strategies for generating factually correct statements. 
Furthermore, our\nanalyses delve into knowledge-probing experiments on factual keywords and\ndifferent categories of sycophancy mitigation.\n","authors":["Aswin RRV","Nemika Tyagi","Md Nayem Uddin","Neeraj Varshney","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2406.03827v2.pdf","comment":"Findings of ACL 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.12574v2","updated":"2024-08-25T23:58:25Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Leyla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v2.pdf","comment":"Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2408.13963v1","updated":"2024-08-25T23:57:07Z","published":"2024-08-25T23:57:07Z","title":"Shifted Window Fourier Transform And Retention For Image Captioning","summary":" Image Captioning is an important Language and Vision task that finds\napplication in a variety of contexts, ranging from healthcare to autonomous\nvehicles. As many real-world applications rely on devices with limited\nresources, much effort in the field was put into the development of lighter and\nfaster models. However, much of the current optimizations focus on the\nTransformer architecture in contrast to the existence of more efficient\nmethods. In this work, we introduce SwiFTeR, an architecture almost entirely\nbased on Fourier Transform and Retention, to tackle the main efficiency\nbottlenecks of current light image captioning models, being the visual\nbackbone's onerosity, and the decoder's quadratic cost. 
SwiFTeR is made of only\n20M parameters, and requires 3.1 GFLOPs for a single forward pass.\nAdditionally, it showcases superior scalability to the caption length and its\nsmall memory requirements enable more images to be processed in parallel,\ncompared to the traditional transformer-based architectures. For instance, it\ncan generate 400 captions in one second. Although, for the time being, the\ncaption quality is lower (110.2 CIDEr-D), most of the decrease is not\nattributed to the architecture but rather an incomplete training practice which\ncurrently leaves much room for improvements. Overall, SwiFTeR points toward a\npromising direction to new efficient architectural design. The implementation\ncode will be released in the future.\n","authors":["Jia Cheng Hu","Roberto Cavicchioli","Alessandro Capotondi"],"pdf_url":"https://arxiv.org/pdf/2408.13963v1.pdf","comment":"Pre-print version of paper accepted for ICONIP 2024"},{"id":"http://arxiv.org/abs/2408.13953v1","updated":"2024-08-25T22:26:46Z","published":"2024-08-25T22:26:46Z","title":"InterTrack: Tracking Human Object Interaction without Object Templates","summary":" Tracking human object interaction from videos is important to understand\nhuman behavior from the rapidly growing stream of video data. Previous\nvideo-based methods require predefined object templates while\nsingle-image-based methods are template-free but lack temporal consistency. In\nthis paper, we present a method to track human object interaction without any\nobject shape templates. We decompose the 4D tracking problem into per-frame\npose tracking and canonical shape optimization. We first apply a single-view\nreconstruction method to obtain temporally-inconsistent per-frame interaction\nreconstructions. Then, for the human, we propose an efficient autoencoder to\npredict SMPL vertices directly from the per-frame reconstructions, introducing\ntemporally consistent correspondence. For the object, we introduce a pose\nestimator that leverages temporal information to predict smooth object\nrotations under occlusions. To train our model, we propose a method to generate\nsynthetic interaction videos and synthesize in total 10 hour videos of 8.5k\nsequences with full 3D ground truth. Experiments on BEHAVE and InterCap show\nthat our method significantly outperforms previous template-based video\ntracking and single-frame reconstruction methods. Our proposed synthetic video\ndataset also allows training video-based methods that generalize to real-world\nvideos. Our code and dataset will be publicly released.\n","authors":["Xianghui Xie","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2408.13953v1.pdf","comment":"17 pages, 13 figures and 6 tables. Project page:\n https://virtualhumans.mpi-inf.mpg.de/InterTrack/"},{"id":"http://arxiv.org/abs/2408.13945v1","updated":"2024-08-25T21:49:10Z","published":"2024-08-25T21:49:10Z","title":"Personalized Topology-Informed 12-Lead ECG Electrode Localization from\n Incomplete Cardiac MRIs for Efficient Cardiac Digital Twins","summary":" Cardiac digital twins (CDTs) offer personalized \\textit{in-silico} cardiac\nrepresentations for the inference of multi-scale properties tied to cardiac\nmechanisms. The creation of CDTs requires precise information about the\nelectrode position on the torso, especially for the personalized\nelectrocardiogram (ECG) calibration. However, current studies commonly rely on\nadditional acquisition of torso imaging and manual/semi-automatic methods for\nECG electrode localization. 
In this study, we propose a novel and efficient\ntopology-informed model to fully automatically extract personalized ECG\nelectrode locations from 2D clinically standard cardiac MRIs. Specifically, we\nobtain the sparse torso contours from the cardiac MRIs and then localize the\nelectrodes from the contours. Cardiac MRIs aim at imaging of the heart instead\nof the torso, leading to incomplete torso geometry within the imaging. To\ntackle the missing topology, we incorporate the electrodes as a subset of the\nkeypoints, which can be explicitly aligned with the 3D torso topology. The\nexperimental results demonstrate that the proposed model outperforms the\ntime-consuming conventional method in terms of accuracy (Euclidean distance:\n$1.24 \\pm 0.293$ cm vs. $1.48 \\pm 0.362$ cm) and efficiency ($2$~s vs.\n$30$-$35$~min). We further demonstrate the effectiveness of using the detected\nelectrodes for \\textit{in-silico} ECG simulation, highlighting their potential\nfor creating accurate and efficient CDT models. The code will be released\npublicly after the manuscript is accepted for publication.\n","authors":["Lei Li","Hannah Smith","Yilin Lyu","Julia Camps","Blanca Rodriguez","Abhirup Banerjee","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2408.13945v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2301.03796v2","updated":"2024-08-25T21:45:24Z","published":"2023-01-10T05:40:28Z","title":"Enhancing Evaluation Methods for Infrared Small-Target Detection in\n Real-world Scenarios","summary":" Infrared small target detection (IRSTD) poses a significant challenge in the\nfield of computer vision. While substantial efforts have been made over the\npast two decades to improve the detection capabilities of IRSTD algorithms,\nthere has been a lack of extensive investigation into the evaluation metrics\nused for assessing their performance. In this paper, we employ a systematic\napproach to address this issue by first evaluating the effectiveness of\nexisting metrics and then proposing new metrics to overcome the limitations of\nconventional ones. To achieve this, we carefully analyze the necessary\nconditions for successful detection and identify the shortcomings of current\nevaluation metrics, including both pre-thresholding and post-thresholding\nmetrics. We then introduce new metrics that are designed to align with the\nrequirements of real-world systems. Furthermore, we utilize these newly\nproposed metrics to compare and evaluate the performance of five widely\nrecognized small infrared target detection algorithms. The results demonstrate\nthat the new metrics provide consistent and meaningful quantitative\nassessments, aligning with qualitative observations.\n","authors":["Saed Moradi","Alireza Memarmoghadam","Payman Moallem","Mohamad Farzan Sabahi"],"pdf_url":"https://arxiv.org/pdf/2301.03796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13936v1","updated":"2024-08-25T20:53:53Z","published":"2024-08-25T20:53:53Z","title":"OpenNav: Efficient Open Vocabulary 3D Object Detection for Smart\n Wheelchair Navigation","summary":" Open vocabulary 3D object detection (OV3D) allows precise and extensible\nobject recognition crucial for adapting to diverse environments encountered in\nassistive robotics. This paper presents OpenNav, a zero-shot 3D object\ndetection pipeline based on RGB-D images for smart wheelchairs. 
Our pipeline\nintegrates an open-vocabulary 2D object detector with a mask generator for\nsemantic segmentation, followed by depth isolation and point cloud construction\nto create 3D bounding boxes. The smart wheelchair exploits these 3D bounding\nboxes to identify potential targets and navigate safely. We demonstrate\nOpenNav's performance through experiments on the Replica dataset and we report\npreliminary results with a real wheelchair. OpenNav improves state-of-the-art\nsignificantly on the Replica dataset at mAP25 (+9pts) and mAP50 (+5pts) with\nmarginal improvement at mAP. The code is publicly available at this link:\nhttps://github.com/EasyWalk-PRIN/OpenNav.\n","authors":["Muhammad Rameez ur Rahman","Piero Simonetto","Anna Polato","Francesco Pasti","Luca Tonin","Sebastiano Vascon"],"pdf_url":"https://arxiv.org/pdf/2408.13936v1.pdf","comment":"ECCVW"},{"id":"http://arxiv.org/abs/2408.13928v1","updated":"2024-08-25T20:09:46Z","published":"2024-08-25T20:09:46Z","title":"GeoPlant: Spatial Plant Species Prediction Dataset","summary":" The difficulty of monitoring biodiversity at fine scales and over large areas\nlimits ecological knowledge and conservation efforts. To fill this gap, Species\nDistribution Models (SDMs) predict species across space from spatially explicit\nfeatures. Yet, they face the challenge of integrating the rich but\nheterogeneous data made available over the past decade, notably millions of\nopportunistic species observations and standardized surveys, as well as\nmulti-modal remote sensing data. In light of that, we have designed and\ndeveloped a new European-scale dataset for SDMs at high spatial resolution\n(10-50 m), including more than 10k species (i.e., most of the European flora).\nThe dataset comprises 5M heterogeneous Presence-Only records and 90k exhaustive\nPresence-Absence survey records, all accompanied by diverse environmental\nrasters (e.g., elevation, human footprint, and soil) that are traditionally\nused in SDMs. In addition, it provides Sentinel-2 RGB and NIR satellite images\nwith 10 m resolution, a 20-year time-series of climatic variables, and\nsatellite time-series from the Landsat program. In addition to the data, we\nprovide an openly accessible SDM benchmark (hosted on Kaggle), which has\nalready attracted an active community and a set of strong baselines for single\npredictor/modality and multimodal approaches. All resources, e.g., the dataset,\npre-trained models, and baseline methods (in the form of notebooks), are\navailable on Kaggle, allowing one to start with our dataset literally with two\nmouse clicks.\n","authors":["Lukas Picek","Christophe Botella","Maximilien Servajean","César Leblanc","Rémi Palard","Théo Larcher","Benjamin Deneu","Diego Marcos","Pierre Bonnet","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2408.13928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13925v1","updated":"2024-08-25T19:47:40Z","published":"2024-08-25T19:47:40Z","title":"Infrared Domain Adaptation with Zero-Shot Quantization","summary":" Quantization is one of the most popular techniques for reducing computation\ntime and shrinking model size. However, ensuring the accuracy of quantized\nmodels typically involves calibration using training data, which may be\ninaccessible due to privacy concerns. In such cases, zero-shot quantization, a\ntechnique that relies on pretrained models and statistical information without\nthe need for specific training data, becomes valuable. 
Exploring zero-shot\nquantization in the infrared domain is important due to the prevalence of\ninfrared imaging in sensitive fields like medical and security applications. In\nthis work, we demonstrate how to apply zero-shot quantization to an object\ndetection model retrained with thermal imagery. We use batch normalization\nstatistics of the model to distill data for calibration. RGB image-trained\nmodels and thermal image-trained models are compared in the context of\nzero-shot quantization. Our investigation focuses on the contributions of mean\nand standard deviation statistics to zero-shot quantization performance.\nAdditionally, we compare zero-shot quantization with post-training quantization\non a thermal dataset. We demonstrated that zero-shot quantization successfully\ngenerates data that represents the training dataset for the quantization of\nobject detection models. Our results indicate that our zero-shot quantization\nframework is effective in the absence of training data and is well-suited for\nthe infrared domain.\n","authors":["Burak Sevsay","Erdem Akagündüz"],"pdf_url":"https://arxiv.org/pdf/2408.13925v1.pdf","comment":"ICMV 2024"},{"id":"http://arxiv.org/abs/2408.13922v1","updated":"2024-08-25T19:18:18Z","published":"2024-08-25T19:18:18Z","title":"COMPOSE: Comprehensive Portrait Shadow Editing","summary":" Existing portrait relighting methods struggle with precise control over\nfacial shadows, particularly when faced with challenges such as handling hard\nshadows from directional light sources or adjusting shadows while remaining in\nharmony with existing lighting conditions. In many situations, completely\naltering input lighting is undesirable for portrait retouching applications:\none may want to preserve some authenticity in the captured environment.\nExisting shadow editing methods typically restrict their application to just\nthe facial region and often offer limited lighting control options, such as\nshadow softening or rotation. In this paper, we introduce COMPOSE: a novel\nshadow editing pipeline for human portraits, offering precise control over\nshadow attributes such as shape, intensity, and position, all while preserving\nthe original environmental illumination of the portrait. This level of\ndisentanglement and controllability is obtained thanks to a novel decomposition\nof the environment map representation into ambient light and an editable\ngaussian dominant light source. COMPOSE is a four-stage pipeline that consists\nof light estimation and editing, light diffusion, shadow synthesis, and finally\nshadow editing. We define facial shadows as the result of a dominant light\nsource, encoded using our novel gaussian environment map representation.\nUtilizing an OLAT dataset, we have trained models to: (1) predict this light\nsource representation from images, and (2) generate realistic shadows using\nthis representation. We also demonstrate comprehensive and intuitive shadow\nediting with our pipeline. 
Through extensive quantitative and qualitative\nevaluations, we have demonstrated the robust capability of our system in shadow\nediting.\n","authors":["Andrew Hou","Zhixin Shu","Xuaner Zhang","He Zhang","Yannick Hold-Geoffroy","Jae Shin Yoon","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2408.13922v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.13912v1","updated":"2024-08-25T18:27:20Z","published":"2024-08-25T18:27:20Z","title":"Splatt3R: Zero-shot Gaussian Splatting from Uncalibrated Image Pairs","summary":" In this paper, we introduce Splatt3R, a pose-free, feed-forward method for\nin-the-wild 3D reconstruction and novel view synthesis from stereo pairs. Given\nuncalibrated natural images, Splatt3R can predict 3D Gaussian Splats without\nrequiring any camera parameters or depth information. For generalizability, we\nstart from a 'foundation' 3D geometry reconstruction method, MASt3R, and extend\nit to be a full 3D structure and appearance reconstructor. Specifically, unlike\nthe original MASt3R which reconstructs only 3D point clouds, we predict the\nadditional Gaussian attributes required to construct a Gaussian primitive for\neach point. Hence, unlike other novel view synthesis methods, Splatt3R is first\ntrained by optimizing the 3D point cloud's geometry loss, and then a novel view\nsynthesis objective. By doing this, we avoid the local minima present in\ntraining 3D Gaussian Splats from stereo views. We also propose a novel loss\nmasking strategy that we empirically find is critical for strong performance on\nextrapolated viewpoints. We train Splatt3R on the ScanNet++ dataset and\ndemonstrate excellent generalisation to uncalibrated, in-the-wild images.\nSplatt3R can reconstruct scenes at 4FPS at 512 x 512 resolution, and the\nresultant splats can be rendered in real-time.\n","authors":["Brandon Smart","Chuanxia Zheng","Iro Laina","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2408.13912v1.pdf","comment":"Our project page can be found at: https://splatt3r.active.vision/"},{"id":"http://arxiv.org/abs/2408.13909v1","updated":"2024-08-25T18:10:16Z","published":"2024-08-25T18:10:16Z","title":"LowCLIP: Adapting the CLIP Model Architecture for Low-Resource Languages\n in Multimodal Image Retrieval Task","summary":" This research explores the development of multimodal vision-language models\nfor image retrieval in low-resource languages, specifically Azerbaijani.\nExisting vision-language models primarily support high-resource languages, and\nfine-tuning them remains computationally demanding. To address challenges in\nvision-language retrieval for low-resource languages, we integrated the CLIP\nmodel architecture and employed several techniques to balance computational\nefficiency with performance. These techniques include synthetic data generation\nthrough machine translation, image augmentation, and further training the\nattention mechanisms of transformer-based models with domain-specific data. We\nintegrated Multilingual BERT as a text encoder with image encoders like\nResNet50, EfficientNet0, Vision Transformer (ViT), and Tiny Swin Transformer.\nOur study found that models like EfficientNet0 and Tiny Swin Transformer\nperform best on the datasets they were trained on, such as COCO, Flickr30k, and\nFlickr8k. Augmentation techniques boosted EfficientNet0 MAP on Flickr30k from\n0.84 to 0.87 and ResNet50 MAP on MSCOCO from 0.70 to 0.80, contributing to a\nnew state of the art in vision-language retrieval. 
We share our configurations\nand results to support further research. Code and pre-trained models are\navailable at https://github.com/aliasgerovs/azclip.\n","authors":["Ali Asgarov","Samir Rustamov"],"pdf_url":"https://arxiv.org/pdf/2408.13909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13906v1","updated":"2024-08-25T18:02:36Z","published":"2024-08-25T18:02:36Z","title":"ConVis: Contrastive Decoding with Hallucination Visualization for\n Mitigating Hallucinations in Multimodal Large Language Models","summary":" Hallucinations in Multimodal Large Language Models (MLLMs) where generated\nresponses fail to accurately reflect the given image pose a significant\nchallenge to their reliability. To address this, we introduce ConVis, a novel\ntraining-free contrastive decoding method. ConVis leverages a text-to-image\n(T2I) generation model to semantically reconstruct the given image from\nhallucinated captions. By comparing the contrasting probability distributions\nproduced by the original and reconstructed images, ConVis enables MLLMs to\ncapture visual contrastive signals that penalize hallucination generation.\nNotably, this method operates purely within the decoding process, eliminating\nthe need for additional data or model updates. Our extensive experiments on\nfive popular benchmarks demonstrate that ConVis effectively reduces\nhallucinations across various MLLMs, highlighting its potential to enhance\nmodel reliability.\n","authors":["Yeji Park","Deokyeong Lee","Junsuk Choe","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2408.13906v1.pdf","comment":"First two authors contributed equally. Source code is available at\n https://github.com/yejipark-m/ConVis"},{"id":"http://arxiv.org/abs/2407.10159v2","updated":"2024-08-25T17:59:22Z","published":"2024-07-14T10:59:34Z","title":"RAPiD-Seg: Range-Aware Pointwise Distance Distribution Networks for 3D\n LiDAR Segmentation","summary":" 3D point clouds play a pivotal role in outdoor scene perception, especially\nin the context of autonomous driving. Recent advancements in 3D LiDAR\nsegmentation often focus intensely on the spatial positioning and distribution\nof points for accurate segmentation. However, these methods, while robust in\nvariable conditions, encounter challenges due to sole reliance on coordinates\nand point intensity, leading to poor isometric invariance and suboptimal\nsegmentation. To tackle this challenge, our work introduces Range-Aware\nPointwise Distance Distribution (RAPiD) features and the associated RAPiD-Seg\narchitecture. Our RAPiD features exhibit rigid transformation invariance and\neffectively adapt to variations in point density, with a design focus on\ncapturing the localized geometry of neighboring structures. They utilize\ninherent LiDAR isotropic radiation and semantic categorization for enhanced\nlocal representation and computational efficiency, while incorporating a 4D\ndistance metric that integrates geometric and surface material reflectivity for\nimproved semantic segmentation. To effectively embed high-dimensional RAPiD\nfeatures, we propose a double-nested autoencoder structure with a novel\nclass-aware embedding objective to encode high-dimensional features into\nmanageable voxel-wise embeddings. Additionally, we propose RAPiD-Seg which\nincorporates a channel-wise attention fusion and two effective RAPiD-Seg\nvariants, further optimizing the embedding for enhanced performance and\ngeneralization. 
Our method outperforms contemporary LiDAR segmentation work in\nterms of mIoU on SemanticKITTI (76.1) and nuScenes (83.6) datasets.\n","authors":["Li Li","Hubert P. H. Shum","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2407.10159v2.pdf","comment":"ECCV 2024 (Oral); 18 pages, 6 figures, 7 tables; Code at\n https://github.com/l1997i/rapid_seg"},{"id":"http://arxiv.org/abs/2408.13902v1","updated":"2024-08-25T17:59:17Z","published":"2024-08-25T17:59:17Z","title":"TraIL-Det: Transformation-Invariant Local Feature Networks for 3D LiDAR\n Object Detection with Unsupervised Pre-Training","summary":" 3D point clouds are essential for perceiving outdoor scenes, especially\nwithin the realm of autonomous driving. Recent advances in 3D LiDAR Object\nDetection focus primarily on the spatial positioning and distribution of points\nto ensure accurate detection. However, despite their robust performance in\nvariable conditions, these methods are hindered by their sole reliance on\ncoordinates and point intensity, resulting in inadequate isometric invariance\nand suboptimal detection outcomes. To tackle this challenge, our work\nintroduces Transformation-Invariant Local (TraIL) features and the associated\nTraIL-Det architecture. Our TraIL features exhibit rigid transformation\ninvariance and effectively adapt to variations in point density, with a design\nfocus on capturing the localized geometry of neighboring structures. They\nutilize the inherent isotropic radiation of LiDAR to enhance local\nrepresentation, improve computational efficiency, and boost detection\nperformance. To effectively process the geometric relations among points within\neach proposal, we propose a Multi-head self-Attention Encoder (MAE) with\nasymmetric geometric features to encode high-dimensional TraIL features into\nmanageable representations. Our method outperforms contemporary self-supervised\n3D object detection approaches in terms of mAP on KITTI (67.8, 20% label,\nmoderate) and Waymo (68.9, 20% label, moderate) datasets under various label\nratios (20%, 50%, and 100%).\n","authors":["Li Li","Tanqiu Qiao","Hubert P. H. Shum","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2408.13902v1.pdf","comment":"BMVC 2024; 15 pages, 3 figures, 3 tables; Code at\n https://github.com/l1997i/rapid_seg"},{"id":"http://arxiv.org/abs/2408.13898v1","updated":"2024-08-25T17:42:05Z","published":"2024-08-25T17:42:05Z","title":"Evaluating Attribute Comprehension in Large Vision-Language Models","summary":" Currently, large vision-language models have gained promising progress on\nmany downstream tasks. However, they still suffer many challenges in\nfine-grained visual understanding tasks, such as object attribute\ncomprehension. Besides, there have been growing efforts on the evaluations of\nlarge vision-language models, but lack of in-depth study of attribute\ncomprehension and the visual language fine-tuning process. In this paper, we\npropose to evaluate the attribute comprehension ability of large\nvision-language models from two perspectives: attribute recognition and\nattribute hierarchy understanding. We evaluate three vision-language\ninteractions, including visual question answering, image-text matching, and\nimage-text cosine similarity. Furthermore, we explore the factors affecting\nattribute comprehension during fine-tuning. 
Through a series of quantitative\nand qualitative experiments, we introduce three main findings: (1) Large\nvision-language models possess good attribute recognition ability, but their\nhierarchical understanding ability is relatively limited. (2) Compared to ITC,\nITM exhibits superior capability in capturing finer details, making it more\nsuitable for attribute understanding tasks. (3) The attribute information in\nthe captions used for fine-tuning plays a crucial role in attribute\nunderstanding. We hope this work can help guide future progress in fine-grained\nvisual understanding of large vision-language models.\n","authors":["Haiwen Zhang","Zixi Yang","Yuanzhi Liu","Xinran Wang","Zheqi He","Kongming Liang","Zhanyu Ma"],"pdf_url":"https://arxiv.org/pdf/2408.13898v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.13896v1","updated":"2024-08-25T17:33:40Z","published":"2024-08-25T17:33:40Z","title":"RT-Attack: Jailbreaking Text-to-Image Models via Random Token","summary":" Recently, Text-to-Image(T2I) models have achieved remarkable success in image\ngeneration and editing, yet these models still have many potential issues,\nparticularly in generating inappropriate or Not-Safe-For-Work(NSFW) content.\nStrengthening attacks and uncovering such vulnerabilities can advance the\ndevelopment of reliable and practical T2I models. Most of the previous works\ntreat T2I models as white-box systems, using gradient optimization to generate\nadversarial prompts. However, accessing the model's gradient is often\nimpossible in real-world scenarios. Moreover, existing defense methods, those\nusing gradient masking, are designed to prevent attackers from obtaining\naccurate gradient information. While some black-box jailbreak attacks have been\nexplored, these typically rely on simply replacing sensitive words, leading to\nsuboptimal attack performance. To address this issue, we introduce a two-stage\nquery-based black-box attack method utilizing random search. In the first\nstage, we establish a preliminary prompt by maximizing the semantic similarity\nbetween the adversarial and target harmful prompts. In the second stage, we use\nthis initial prompt to refine our approach, creating a detailed adversarial\nprompt aimed at jailbreaking and maximizing the similarity in image features\nbetween the images generated from this prompt and those produced by the target\nharmful prompt. Extensive experiments validate the effectiveness of our method\nin attacking the latest prompt checkers, post-hoc image checkers, securely\ntrained T2I models, and online commercial models.\n","authors":["Sensen Gao","Xiaojun Jia","Yihao Huang","Ranjie Duan","Jindong Gu","Yang Liu","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2408.13896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20891v2","updated":"2024-08-25T17:07:49Z","published":"2024-07-30T15:07:13Z","title":"Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian\n Neural Networks","summary":" Computational complexity of Bayesian learning is impeding its adoption in\npractical, large-scale tasks. Despite demonstrations of significant merits such\nas improved robustness and resilience to unseen or out-of-distribution inputs\nover their non- Bayesian counterparts, their practical use has faded to near\ninsignificance. In this study, we introduce an innovative framework to mitigate\nthe computational burden of Bayesian neural networks (BNNs). 
Our approach\nfollows the principle of Bayesian techniques based on deep ensembles, but\nsignificantly reduces their cost via multiple low-rank perturbations of\nparameters arising from a pre-trained neural network. Both vanilla version of\nensembles as well as more sophisticated schemes such as Bayesian learning with\nStein Variational Gradient Descent (SVGD), previously deemed impractical for\nlarge models, can be seamlessly implemented within the proposed framework,\ncalled Bayesian Low-Rank LeArning (Bella). In a nutshell, i) Bella achieves a\ndramatic reduction in the number of trainable parameters required to\napproximate a Bayesian posterior; and ii) it not only maintains, but in some\ninstances, surpasses the performance of conventional Bayesian learning methods\nand non-Bayesian baselines. Our results with large-scale tasks such as\nImageNet, CAMELYON17, DomainNet, VQA with CLIP, LLaVA demonstrate the\neffectiveness and versatility of Bella in building highly scalable and\npractical Bayesian deep models for real-world applications.\n","authors":["Bao Gia Doan","Afshar Shamsi","Xiao-Yu Guo","Arash Mohammadi","Hamid Alinejad-Rokny","Dino Sejdinovic","Damith C. Ranasinghe","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.20891v2.pdf","comment":"17 pages, 14 figures, 11 tables"},{"id":"http://arxiv.org/abs/2408.13890v1","updated":"2024-08-25T16:43:47Z","published":"2024-08-25T16:43:47Z","title":"Making Large Language Models Better Planners with Reasoning-Decision\n Alignment","summary":" Data-driven approaches for autonomous driving (AD) have been widely adopted\nin the past decade but are confronted with dataset bias and uninterpretability.\nInspired by the knowledge-driven nature of human driving, recent approaches\nexplore the potential of large language models (LLMs) to improve understanding\nand decision-making in traffic scenarios. They find that the pretrain-finetune\nparadigm of LLMs on downstream data with the Chain-of-Thought (CoT) reasoning\nprocess can enhance explainability and scene understanding. However, such a\npopular strategy proves to suffer from the notorious problems of misalignment\nbetween the crafted CoTs against the consequent decision-making, which remains\nuntouched by previous LLM-based AD methods. To address this problem, we\nmotivate an end-to-end decision-making model based on multimodality-augmented\nLLM, which simultaneously executes CoT reasoning and carries out planning\nresults. Furthermore, we propose a reasoning-decision alignment constraint\nbetween the paired CoTs and planning results, imposing the correspondence\nbetween reasoning and decision-making. Moreover, we redesign the CoTs to enable\nthe model to comprehend complex scenarios and enhance decision-making\nperformance. We dub our proposed large language planners with\nreasoning-decision alignment as RDA-Driver. Experimental evaluations on the\nnuScenes and DriveLM-nuScenes benchmarks demonstrate the effectiveness of our\nRDA-Driver in enhancing the performance of end-to-end AD systems. 
Specifically,\nour RDA-Driver achieves state-of-the-art planning performance on the nuScenes\ndataset with 0.80 L2 error and 0.32 collision rate, and also achieves leading\nresults on challenging DriveLM-nuScenes benchmarks with 0.82 L2 error and 0.38\ncollision rate.\n","authors":["Zhijian Huang","Tao Tang","Shaoxiang Chen","Sihao Lin","Zequn Jie","Lin Ma","Guangrun Wang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2408.13890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14773v2","updated":"2024-08-25T16:11:03Z","published":"2023-12-22T15:39:37Z","title":"Cross-Age and Cross-Site Domain Shift Impacts on Deep Learning-Based\n White Matter Fiber Estimation in Newborn and Baby Brains","summary":" Deep learning models have shown great promise in estimating tissue\nmicrostructure from limited diffusion magnetic resonance imaging data. However,\nthese models face domain shift challenges when test and train data are from\ndifferent scanners and protocols, or when the models are applied to data with\ninherent variations such as the developing brains of infants and children\nscanned at various ages. Several techniques have been proposed to address some\nof these challenges, such as data harmonization or domain adaptation in the\nadult brain. However, those techniques remain unexplored for the estimation of\nfiber orientation distribution functions in the rapidly developing brains of\ninfants. In this work, we extensively investigate the age effect and domain\nshift within and across two different cohorts of 201 newborns and 165 babies\nusing the Method of Moments and fine-tuning strategies. Our results show that\nreduced variations in the microstructural development of babies in comparison\nto newborns directly impact the deep learning models' cross-age performance. We\nalso demonstrate that a small number of target domain samples can significantly\nmitigate domain shift problems.\n","authors":["Rizhong Lin","Ali Gholipour","Jean-Philippe Thiran","Davood Karimi","Hamza Kebiri","Meritxell Bach Cuadra"],"pdf_url":"https://arxiv.org/pdf/2312.14773v2.pdf","comment":"5 pages, 5 figures; accepted as an Oral Presentation at the 2024 IEEE\n International Symposium on Biomedical Imaging (ISBI) in Athens, Greece"},{"id":"http://arxiv.org/abs/2408.13877v1","updated":"2024-08-25T15:56:33Z","published":"2024-08-25T15:56:33Z","title":"Camouflaged Object Tracking: A Benchmark","summary":" Visual tracking has seen remarkable advancements, largely driven by the\navailability of large-scale training datasets that have enabled the development\nof highly accurate and robust algorithms. While significant progress has been\nmade in tracking general objects, research on more challenging scenarios, such\nas tracking camouflaged objects, remains limited. Camouflaged objects, which\nblend seamlessly with their surroundings or other objects, present unique\nchallenges for detection and tracking in complex environments. This challenge\nis particularly critical in applications such as military, security,\nagriculture, and marine monitoring, where precise tracking of camouflaged\nobjects is essential. To address this gap, we introduce the Camouflaged Object\nTracking Dataset (COTD), a specialized benchmark designed specifically for\nevaluating camouflaged object tracking methods. The COTD dataset comprises 200\nsequences and approximately 80,000 frames, each annotated with detailed\nbounding boxes. 
Our evaluation of 20 existing tracking algorithms reveals\nsignificant deficiencies in their performance with camouflaged objects. To\naddress these issues, we propose a novel tracking framework, HiPTrack-MLS,\nwhich demonstrates promising results in improving tracking performance for\ncamouflaged objects. COTD and code are available at\nhttps://github.com/openat25/HIPTrack-MLS.\n","authors":["Xiaoyu Guo","Pengzhi Zhong","Hao Zhang","Ling Huang","Defeng Huang","Shuiwang Li"],"pdf_url":"https://arxiv.org/pdf/2408.13877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12528v2","updated":"2024-08-25T15:46:51Z","published":"2024-08-22T16:32:32Z","title":"Show-o: One Single Transformer to Unify Multimodal Understanding and\n Generation","summary":" We present a unified transformer, i.e., Show-o, that unifies multimodal\nunderstanding and generation. Unlike fully autoregressive models, Show-o\nunifies autoregressive and (discrete) diffusion modeling to adaptively handle\ninputs and outputs of various and mixed modalities. The unified model flexibly\nsupports a wide range of vision-language tasks including visual\nquestion-answering, text-to-image generation, text-guided\ninpainting/extrapolation, and mixed-modality generation. Across various\nbenchmarks, it demonstrates comparable or superior performance to existing\nindividual models with an equivalent or larger number of parameters tailored\nfor understanding or generation. This significantly highlights its potential as\na next-generation foundation model. Code and models are released at\nhttps://github.com/showlab/Show-o.\n","authors":["Jinheng Xie","Weijia Mao","Zechen Bai","David Junhao Zhang","Weihao Wang","Kevin Qinghong Lin","Yuchao Gu","Zhijie Chen","Zhenheng Yang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2408.12528v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2408.13868v1","updated":"2024-08-25T15:36:28Z","published":"2024-08-25T15:36:28Z","title":"Particle-Filtering-based Latent Diffusion for Inverse Problems","summary":" Current strategies for solving image-based inverse problems apply latent\ndiffusion models to perform posterior sampling. However, almost all approaches\nmake no explicit attempt to explore the solution space, instead drawing only a\nsingle sample from a Gaussian distribution from which to generate their\nsolution. In this paper, we introduce a particle-filtering-based framework for\na nonlinear exploration of the solution space in the initial stages of reverse\nSDE methods. Our proposed particle-filtering-based latent diffusion (PFLD)\nmethod and proposed problem formulation and framework can be applied to any\ndiffusion-based solution for linear or nonlinear inverse problems. Our\nexperimental results show that PFLD outperforms the SoTA solver PSLD on the\nFFHQ-1K and ImageNet-1K datasets on inverse problem tasks of super resolution,\nGaussian deblurring and inpainting.\n","authors":["Amir Nazemi","Mohammad Hadi Sepanj","Nicholas Pellegrino","Chris Czarnecki","Paul Fieguth"],"pdf_url":"https://arxiv.org/pdf/2408.13868v1.pdf","comment":"Mohammad Hadi Sepanj, Nicholas Pellegrino, and Chris Czarnecki\n contributed equally"},{"id":"http://arxiv.org/abs/2408.13860v1","updated":"2024-08-25T15:17:43Z","published":"2024-08-25T15:17:43Z","title":"Knowledge-Aware Reasoning over Multimodal Semi-structured Tables","summary":" Existing datasets for tabular question answering typically focus exclusively\non text within cells. 
However, real-world data is inherently multimodal, often\nblending images such as symbols, faces, icons, patterns, and charts with\ntextual content in tables. With the evolution of AI models capable of\nmultimodal reasoning, it is pertinent to assess their efficacy in handling such\nstructured data. This study investigates whether current AI models can perform\nknowledge-aware reasoning on multimodal structured data. We explore their\nability to reason on tables that integrate both images and text, introducing\nMMTabQA, a new dataset designed for this purpose. Our experiments highlight\nsubstantial challenges for current AI models in effectively integrating and\ninterpreting multiple text and image inputs, understanding visual context, and\ncomparing visual content across images. These findings establish our dataset as\na robust benchmark for advancing AI's comprehension and capabilities in\nanalyzing multimodal structured data.\n","authors":["Suyash Vardhan Mathur","Jainit Sushil Bafna","Kunal Kartik","Harshita Khandelwal","Manish Shrivastava","Vivek Gupta","Mohit Bansal","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2408.13860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13858v1","updated":"2024-08-25T15:05:32Z","published":"2024-08-25T15:05:32Z","title":"Draw Like an Artist: Complex Scene Generation with Diffusion Model via\n Composition, Painting, and Retouching","summary":" Recent advances in text-to-image diffusion models have demonstrated\nimpressive capabilities in image quality. However, complex scene generation\nremains relatively unexplored, and even the definition of `complex scene'\nitself remains unclear. In this paper, we address this gap by providing a\nprecise definition of complex scenes and introducing a set of Complex\nDecomposition Criteria (CDC) based on this definition. Inspired by the artists\npainting process, we propose a training-free diffusion framework called Complex\nDiffusion (CxD), which divides the process into three stages: composition,\npainting, and retouching. Our method leverages the powerful chain-of-thought\ncapabilities of large language models (LLMs) to decompose complex prompts based\non CDC and to manage composition and layout. We then develop an attention\nmodulation method that guides simple prompts to specific regions to complete\nthe complex scene painting. Finally, we inject the detailed output of the LLM\ninto a retouching model to enhance the image details, thus implementing the\nretouching stage. Extensive experiments demonstrate that our method outperforms\nprevious SOTA approaches, significantly improving the generation of\nhigh-quality, semantically consistent, and visually diverse images for complex\nscenes, even with intricate prompts.\n","authors":["Minghao Liu","Le Zhang","Yingjie Tian","Xiaochao Qu","Luoqi Liu","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.13858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13854v1","updated":"2024-08-25T14:47:25Z","published":"2024-08-25T14:47:25Z","title":"Tangram: A Challenging Benchmark for Geometric Element Recognizing","summary":" Significant advancements in Large Multimodal Models (LMMs) have enabled them\nto tackle complex problems involving visual-mathematical reasoning. However,\ntheir ability to identify geometric elements remains understudied. To bridge\nthis gap, we introduce Tangram, a novel benchmark designed to evaluate the\nperformance of LMMs on geometric element recognition. 
Tangram includes 1,080\ndiverse geometric diagrams sourced from primary and secondary school exams,\ncompetitions, and textbooks, covering from simple basic geometric shapes to\ncomplex combinations. Each diagram is associated with four questions, resulting\nin a total of 4,320 visual-question-answer pairs. Unlike existing benchmarks\nthat seek higher-level cognition and reasoning, Tangram focuses on the\nunderstanding of geometric elements, requiring models to perform a \"simple but\ninteresting\" counting task. Systematic evaluation of 10 prominent LMMs, such as\nGPT-4o and Claude 3.5 Sonnet, shows that even in the seemingly simple task,\nthese models still face significant challenges. Notably, the overall accuracy\nof the top performer across all tested models is only 56.8%, marking a\nsignificant gap when compared to human performance. These findings highlight\nthe limitations of current multimodal artificial intelligence systems in\nhandling basic perception tasks, and will inspire the development of the next\ngeneration of expert-level multimodal foundational models. The Tangram and\nevaluation code will be available soon.\n","authors":["Jiamin Tang","Chao Zhang","Xudong Zhu","Mengchi Liu"],"pdf_url":"https://arxiv.org/pdf/2408.13854v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.13852v1","updated":"2024-08-25T14:46:29Z","published":"2024-08-25T14:46:29Z","title":"LaneTCA: Enhancing Video Lane Detection with Temporal Context\n Aggregation","summary":" In video lane detection, there are rich temporal contexts among successive\nframes, which is under-explored in existing lane detectors. In this work, we\npropose LaneTCA to bridge the individual video frames and explore how to\neffectively aggregate the temporal context. Technically, we develop an\naccumulative attention module and an adjacent attention module to abstract the\nlong-term and short-term temporal context, respectively. The accumulative\nattention module continuously accumulates visual information during the journey\nof a vehicle, while the adjacent attention module propagates this lane\ninformation from the previous frame to the current frame. The two modules are\nmeticulously designed based on the transformer architecture. Finally, these\nlong-short context features are fused with the current frame features to\npredict the lane lines in the current frame. Extensive quantitative and\nqualitative experiments are conducted on two prevalent benchmark datasets. The\nresults demonstrate the effectiveness of our method, achieving several new\nstate-of-the-art records. The codes and models are available at\nhttps://github.com/Alex-1337/LaneTCA\n","authors":["Keyi Zhou","Li Li","Wengang Zhou","Yonghui Wang","Hao Feng","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.13852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13845v1","updated":"2024-08-25T14:28:49Z","published":"2024-08-25T14:28:49Z","title":"Bring the Power of Diffusion Model to Defect Detection","summary":" Due to the high complexity and technical requirements of industrial\nproduction processes, surface defects will inevitably appear, which seriously\naffects the quality of products. Although existing lightweight detection\nnetworks are highly efficient, they are susceptible to false or missed\ndetection of non-salient defects due to the lack of semantic information. In\ncontrast, the diffusion model can generate higher-order semantic\nrepresentations in the denoising process. 
Therefore, the aim of this paper is\nto incorporate the higher-order modelling capability of the diffusion model\ninto the detection model, so as to better assist in the classification and\nlocalization of difficult targets. First, the denoising diffusion probabilistic\nmodel (DDPM) is pre-trained to extract the features of denoising process to\nconstruct as a feature repository. In particular, to avoid the potential\nbottleneck of memory caused by the dataloader loading high-dimensional\nfeatures, a residual convolutional variational auto-encoder (ResVAE) is\ndesigned to further compress the feature repository. The image is fed into both\nimage backbone and feature repository for feature extraction and querying\nrespectively. The queried latent features are reconstructed and filtered to\nobtain high-dimensional DDPM features. A dynamic cross-fusion method is\nproposed to fully refine the contextual features of DDPM to optimize the\ndetection model. Finally, we employ knowledge distillation to migrate the\nhigher-order modelling capabilities back into the lightweight baseline model\nwithout additional efficiency cost. Experiment results demonstrate that our\nmethod achieves competitive results on several industrial datasets.\n","authors":["Xuyi Yu"],"pdf_url":"https://arxiv.org/pdf/2408.13845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12253v2","updated":"2024-08-25T14:13:40Z","published":"2024-08-22T09:45:24Z","title":"Epsilon: Exploring Comprehensive Visual-Semantic Projection for\n Multi-Label Zero-Shot Learning","summary":" This paper investigates a challenging problem of zero-shot learning in the\nmulti-label scenario (MLZSL), wherein the model is trained to recognize\nmultiple unseen classes within a sample (e.g., an image) based on seen classes\nand auxiliary knowledge, e.g., semantic information. Existing methods usually\nresort to analyzing the relationship of various seen classes residing in a\nsample from the dimension of spatial or semantic characteristics and\ntransferring the learned model to unseen ones. However, they neglect the\nintegrity of local and global features. Although the use of the attention\nstructure will accurately locate local features, especially objects, it will\nsignificantly lose its integrity, and the relationship between classes will\nalso be affected. Rough processing of global features will also directly affect\ncomprehensiveness. This neglect will make the model lose its grasp of the main\ncomponents of the image. Relying only on the local existence of seen classes\nduring the inference stage introduces unavoidable bias. In this paper, we\npropose a novel and comprehensive visual-semantic framework for MLZSL, dubbed\nEpsilon, to fully make use of such properties and enable a more accurate and\nrobust visual-semantic projection. In terms of spatial information, we achieve\neffective refinement by group aggregating image features into several semantic\nprompts. It can aggregate semantic information rather than class information,\npreserving the correlation between semantics. In terms of global semantics, we\nuse global forward propagation to collect as much information as possible to\nensure that semantics are not omitted. 
Experiments on large-scale MLZSL\nbenchmark datasets NUS-Wide and Open-Images-v4 demonstrate that the proposed\nEpsilon outperforms other state-of-the-art methods with large margins.\n","authors":["Ziming Liu","Jingcai Guo","Song Guo","Xiaocheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.12253v2.pdf","comment":"11 pages, 6 figures. arXiv admin note: substantial text overlap with\n arXiv:2309.00923"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2305.04928v5","updated":"2024-08-25T06:22:00Z","published":"2023-05-05T12:14:22Z","title":"From Zero to Hero: Harnessing Transformers for Biomedical Named Entity\n Recognition in Zero- and Few-shot Contexts","summary":" Supervised named entity recognition (NER) in the biomedical domain depends on\nlarge sets of annotated texts with the given named entities. The creation of\nsuch datasets can be time-consuming and expensive, while extraction of new\nentities requires additional annotation tasks and retraining the model. To\naddress these challenges, this paper proposes a method for zero- and few-shot\nNER in the biomedical domain. The method is based on transforming the task of\nmulti-class token classification into binary token classification and\npre-training on a large amount of datasets and biomedical entities, which allow\nthe model to learn semantic relations between the given and potentially novel\nnamed entity labels. We have achieved average F1 scores of 35.44% for zero-shot\nNER, 50.10% for one-shot NER, 69.94% for 10-shot NER, and 79.51% for 100-shot\nNER on 9 diverse evaluated biomedical entities with fine-tuned PubMedBERT-based\nmodel. The results demonstrate the effectiveness of the proposed method for\nrecognizing new biomedical entities with no or limited number of examples,\noutperforming previous transformer-based methods, and being comparable to\nGPT3-based models using models with over 1000 times fewer parameters. We make\nmodels and developed code publicly available.\n","authors":["Miloš Košprdić","Nikola Prodanović","Adela Ljajić","Bojana Bašaragin","Nikola Milošević"],"pdf_url":"https://arxiv.org/pdf/2305.04928v5.pdf","comment":"Collaboration between Bayer Pharma R&D and Serbian Institute for\n Artificial Intelligence Research and Development. Artificial Intelligence in\n Medicine (2024)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.12574v2","updated":"2024-08-25T23:58:25Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. 
In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Leyla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v2.pdf","comment":"Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2408.13961v1","updated":"2024-08-25T23:49:35Z","published":"2024-08-25T23:49:35Z","title":"Optimizing Luxury Vehicle Dealership Networks: A Graph Neural Network\n Approach to Site Selection","summary":" This study presents a novel application of Graph Neural Networks (GNNs) to\noptimize dealership network planning for a luxury car manufacturer in the U.S.\nBy conducting a comprehensive literature review on dealership location\ndeterminants, the study identifies 65 county-level explanatory variables,\naugmented by two additional measures of regional interconnectedness derived\nfrom social and mobility data. An ablation study involving 34 variable\ncombinations and ten state-of-the-art GNN operators reveals key insights into\nthe predictive power of various variables, particularly highlighting the\nsignificance of competition, demographic factors, and mobility patterns in\ninfluencing dealership location decisions. The analysis pinpoints seven\nspecific counties as promising targets for network expansion. This research not\nonly illustrates the effectiveness of GNNs in solving complex geospatial\ndecision-making problems but also provides actionable recommendations and\nvaluable methodological insights for industry practitioners.\n","authors":["Luca Silvano Carocci","Qiwei Han"],"pdf_url":"https://arxiv.org/pdf/2408.13961v1.pdf","comment":"10 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.13960v1","updated":"2024-08-25T23:48:11Z","published":"2024-08-25T23:48:11Z","title":"Time Series Analysis for Education: Methods, Applications, and Future\n Directions","summary":" Recent advancements in the collection and analysis of sequential educational\ndata have brought time series analysis to a pivotal position in educational\nresearch, highlighting its essential role in facilitating data-driven\ndecision-making. However, there is a lack of comprehensive summaries that\nconsolidate these advancements. To the best of our knowledge, this paper is the\nfirst to provide a comprehensive review of time series analysis techniques\nspecifically within the educational context. We begin by exploring the\nlandscape of educational data analytics, categorizing various data sources and\ntypes relevant to education. We then review four prominent time series\nmethods-forecasting, classification, clustering, and anomaly\ndetection-illustrating their specific application points in educational\nsettings. 
Subsequently, we present a range of educational scenarios and\napplications, focusing on how these methods are employed to address diverse\neducational tasks, which highlights the practical integration of multiple time\nseries methods to solve complex educational problems. Finally, we conclude with\na discussion on future directions, including personalized learning analytics,\nmultimodal data fusion, and the role of large language models (LLMs) in\neducational time series. The contributions of this paper include a detailed\ntaxonomy of educational data, a synthesis of time series techniques with\nspecific educational applications, and a forward-looking perspective on\nemerging trends and future research opportunities in educational analysis. The\nrelated papers and resources are available and regularly updated at the project\npage.\n","authors":["Shengzhong Mao","Chaoli Zhang","Yichi Song","Jindong Wang","Xiao-Jun Zeng","Zenglin Xu","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2408.13960v1.pdf","comment":"24 pages, 3 figures, 6 tables, project page: see\n https://github.com/ai-for-edu/time-series-analysis-for-education"},{"id":"http://arxiv.org/abs/2408.13958v1","updated":"2024-08-25T23:41:39Z","published":"2024-08-25T23:41:39Z","title":"Prediction of COPD Using Machine Learning, Clinical Summary Notes, and\n Vital Signs","summary":" Chronic obstructive pulmonary disease (COPD) is a chronic inflammatory lung\ndisease that causes obstructed airflow from the lungs. In the United States,\nmore than 15.7 million Americans have been diagnosed with COPD, with 96% of\nindividuals living with at least one other chronic health condition. It is the\n4th leading cause of death in the country. Over 2.2 million patients are\nadmitted to hospitals annually due to COPD exacerbations. Monitoring and\npredicting patient exacerbations on-time could save their life. This paper\npresents two different predictive models to predict COPD exacerbation using AI\nand natural language processing (NLP) approaches. These models use respiration\nsummary notes, symptoms, and vital signs. To train and test these models, data\nrecords containing physiologic signals and vital signs time series were used.\nThese records were captured from patient monitors and comprehensive clinical\ndata obtained from hospital medical information systems for tens of thousands\nof Intensive Care Unit (ICU) patients. We achieved an area under the Receiver\noperating characteristic (ROC) curve of 0.82 in detection and prediction of\nCOPD exacerbation.\n","authors":["Negar Orangi-Fard"],"pdf_url":"https://arxiv.org/pdf/2408.13958v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.16168v3","updated":"2024-08-25T23:06:51Z","published":"2024-04-24T19:55:50Z","title":"The Over-Certainty Phenomenon in Modern UDA Algorithms","summary":" When neural networks are confronted with unfamiliar data that deviate from\ntheir training set, this signifies a domain shift. While these networks output\npredictions on their inputs, they typically fail to account for their level of\nfamiliarity with these novel observations. While prevailing works navigate\nunsupervised domain adaptation with the goal of curtailing model entropy, they\nunintentionally birth models that grapple with sub-optimal calibration - a\ndilemma we term the over-certainty phenomenon. 
In this paper, we uncover a\nconcerning trend in unsupervised domain adaptation and propose a solution that\nnot only maintains accuracy but also addresses calibration.\n","authors":["Fin Amin","Jung-Eun Kim"],"pdf_url":"https://arxiv.org/pdf/2404.16168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04895v2","updated":"2024-08-25T22:30:42Z","published":"2024-08-09T06:46:06Z","title":"Better Not to Propagate: Understanding Edge Uncertainty and\n Over-smoothing in Signed Graph Neural Networks","summary":" Traditional Graph Neural Networks (GNNs) rely on network homophily, which can\nlead to performance degradation due to over-smoothing in many real-world\nheterophily scenarios. Recent studies analyze the smoothing effect\n(separability) after message-passing (MP), depending on the expectation of node\nfeatures. Regarding separability gain, they provided theoretical backgrounds on\nover-smoothing caused by various propagation schemes, including positive,\nsigned, and blocked MPs. More recently, by extending these theorems, some works\nhave suggested improvements in signed propagation under multiple classes.\nHowever, prior works assume that the error ratio of all propagation schemes is\nfixed, failing to investigate this phenomenon correctly. To solve this problem,\nwe propose a novel method for estimating homophily and edge error ratio,\nintegrated with dynamic selection between blocked and signed propagation during\ntraining. Our theoretical analysis, supported by extensive experiments,\ndemonstrates that blocking MP can be more effective than signed propagation\nunder high edge error ratios, improving the performance in both homophilic and\nheterophilic graphs.\n","authors":["Yoonhyuk Choi","Jiho Choi","Taewook Ko","Chong-Kwon Kim"],"pdf_url":"https://arxiv.org/pdf/2408.04895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09505v3","updated":"2024-08-25T21:59:43Z","published":"2023-09-18T06:33:28Z","title":"Outlier-Insensitive Kalman Filtering: Theory and Applications","summary":" State estimation of dynamical systems from noisy observations is a\nfundamental task in many applications. It is commonly addressed using the\nlinear Kalman filter (KF), whose performance can significantly degrade in the\npresence of outliers in the observations, due to the sensitivity of its convex\nquadratic objective function. To mitigate such behavior, outlier detection\nalgorithms can be applied. In this work, we propose a parameter-free algorithm\nwhich mitigates the harmful effect of outliers while requiring only a short\niterative process of the standard update step of the KF. To that end, we model\neach potential outlier as a normal process with unknown variance and apply\nonline estimation through either expectation maximization or alternating\nmaximization algorithms. Simulations and field experiment evaluations\ndemonstrate competitive performance of our method, showcasing its robustness to\noutliers in filtering scenarios compared to alternative algorithms.\n","authors":["Shunit Truzman","Guy Revach","Nir Shlezinger","Itzik Klein"],"pdf_url":"https://arxiv.org/pdf/2309.09505v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12856v3","updated":"2024-08-25T20:59:10Z","published":"2024-03-19T16:01:25Z","title":"Equivariant Ensembles and Regularization for Reinforcement Learning in\n Map-based Path Planning","summary":" In reinforcement learning (RL), exploiting environmental symmetries can\nsignificantly enhance efficiency, robustness, and performance. 
However,\nensuring that the deep RL policy and value networks are respectively\nequivariant and invariant to exploit these symmetries is a substantial\nchallenge. Related works try to design networks that are equivariant and\ninvariant by construction, limiting them to a very restricted library of\ncomponents, which in turn hampers the expressiveness of the networks. This\npaper proposes a method to construct equivariant policies and invariant value\nfunctions without specialized neural network components, which we term\nequivariant ensembles. We further add a regularization term for adding\ninductive bias during training. In a map-based path planning case study, we\nshow how equivariant ensembles and regularization benefit sample efficiency and\nperformance.\n","authors":["Mirco Theile","Hongpeng Cao","Marco Caccamo","Alberto L. Sangiovanni-Vincentelli"],"pdf_url":"https://arxiv.org/pdf/2403.12856v3.pdf","comment":"Accepted at IROS 2024. A video can be found here:\n https://youtu.be/L6NOdvU7n7s. The code is available at\n https://github.com/theilem/uavSim"},{"id":"http://arxiv.org/abs/2408.13934v1","updated":"2024-08-25T20:43:34Z","published":"2024-08-25T20:43:34Z","title":"Learning to Move Like Professional Counter-Strike Players","summary":" In multiplayer, first-person shooter games like Counter-Strike: Global\nOffensive (CS:GO), coordinated movement is a critical component of high-level\nstrategic play. However, the complexity of team coordination and the variety of\nconditions present in popular game maps make it impractical to author\nhand-crafted movement policies for every scenario. We show that it is possible\nto take a data-driven approach to creating human-like movement controllers for\nCS:GO. We curate a team movement dataset comprising 123 hours of professional\ngame play traces, and use this dataset to train a transformer-based movement\nmodel that generates human-like team movement for all players in a \"Retakes\"\nround of the game. Importantly, the movement prediction model is efficient.\nPerforming inference for all players takes less than 0.5 ms per game step\n(amortized cost) on a single CPU core, making it plausible for use in\ncommercial games today. Human evaluators assess that our model behaves more\nlike humans than both commercially-available bots and procedural movement\ncontrollers scripted by experts (16% to 59% higher by TrueSkill rating of\n\"human-like\"). Using experiments involving in-game bot vs. bot self-play, we\ndemonstrate that our model performs simple forms of teamwork, makes fewer\ncommon movement mistakes, and yields movement distributions, player lifetimes,\nand kill locations similar to those observed in professional CS:GO match play.\n","authors":["David Durst","Feng Xie","Vishnu Sarukkai","Brennan Shacklett","Iuri Frosio","Chen Tessler","Joohwan Kim","Carly Taylor","Gilbert Bernstein","Sanjiban Choudhury","Pat Hanrahan","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2408.13934v1.pdf","comment":"The project website is at https://davidbdurst.com/mlmove/"},{"id":"http://arxiv.org/abs/2201.05760v4","updated":"2024-08-25T20:43:12Z","published":"2022-01-15T05:25:03Z","title":"Network Level Spatial Temporal Traffic State Forecasting with\n Hierarchical Attention LSTM (HierAttnLSTM)","summary":" Traffic state data, such as speed, volume and travel time collected from\nubiquitous traffic monitoring sensors require advanced network level analytics\nfor forecasting and identifying significant traffic patterns. 
This paper\nleverages diverse traffic state datasets from the Caltrans Performance\nMeasurement System (PeMS) hosted on the open benchmark and achieved promising\nperformance compared to well recognized spatial-temporal models. Drawing\ninspiration from the success of hierarchical architectures in various\nArtificial Intelligence (AI) tasks, we integrate cell and hidden states from\nlow-level to high-level Long Short-Term Memory (LSTM) networks with an\nattention pooling mechanism, similar to human perception systems. The developed\nhierarchical structure is designed to account for dependencies across different\ntime scales, capturing the spatial-temporal correlations of network-level\ntraffic states, enabling the prediction of traffic states for all corridors\nrather than a single link or route. The efficiency of designed attention-based\nLSTM is analyzed by ablation study. Comparative results with baseline LSTM\nmodels demonstrate that the Hierarchical Attention LSTM (HierAttnLSTM) model\nnot only provides higher prediction accuracy but also effectively forecasts\nunusual congestion patterns. Data and code are made publicly available to\nsupport reproducible scientific research.\n","authors":["Tianya Terry Zhang"],"pdf_url":"https://arxiv.org/pdf/2201.05760v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19911v2","updated":"2024-08-25T20:16:51Z","published":"2024-07-29T11:39:22Z","title":"Efficient Shield Synthesis via State-Space Transformation","summary":" We consider the problem of synthesizing safety strategies for control\nsystems, also known as shields. Since the state space is infinite, shields are\ntypically computed over a finite-state abstraction, with the most common\nabstraction being a rectangular grid. However, for many systems, such a grid\ndoes not align well with the safety property or the system dynamics. That is\nwhy a coarse grid is rarely sufficient, but a fine grid is typically\ncomputationally infeasible to obtain. In this paper, we show that appropriate\nstate-space transformations can still allow to use a coarse grid at almost no\ncomputational overhead. We demonstrate in three case studies that our\ntransformation-based synthesis outperforms a standard synthesis by several\norders of magnitude. In the first two case studies, we use domain knowledge to\nselect a suitable transformation. In the third case study, we instead report on\nresults in engineering a transformation without domain knowledge.\n","authors":["Asger Horn Brorholt","Andreas Holck Høeg-Petersen","Kim Guldstrand Larsen","Christian Schilling"],"pdf_url":"https://arxiv.org/pdf/2407.19911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13926v1","updated":"2024-08-25T19:51:27Z","published":"2024-08-25T19:51:27Z","title":"FedGlu: A personalized federated learning-based glucose forecasting\n algorithm for improved performance in glycemic excursion regions","summary":" Continuous glucose monitoring (CGM) devices provide real-time glucose\nmonitoring and timely alerts for glycemic excursions, improving glycemic\ncontrol among patients with diabetes. However, identifying rare events like\nhypoglycemia and hyperglycemia remain challenging due to their infrequency.\nMoreover, limited access to sensitive patient data hampers the development of\nrobust machine learning models. Our objective is to accurately predict glycemic\nexcursions while addressing data privacy concerns. 
To tackle excursion\nprediction, we propose a novel Hypo-Hyper (HH) loss function, which\nsignificantly improves performance in the glycemic excursion regions. The HH\nloss function demonstrates a 46% improvement over mean-squared error (MSE) loss\nacross 125 patients. To address privacy concerns, we propose FedGlu, a machine\nlearning model trained in a federated learning (FL) framework. FL allows\ncollaborative learning without sharing sensitive data by training models\nlocally and sharing only model parameters across other patients. FedGlu\nachieves a 35% superior glycemic excursion detection rate compared to local\nmodels. This improvement translates to enhanced performance in predicting both,\nhypoglycemia and hyperglycemia, for 105 out of 125 patients. These results\nunderscore the effectiveness of the proposed HH loss function in augmenting the\npredictive capabilities of glucose predictions. Moreover, implementing models\nwithin a federated learning framework not only ensures better predictive\ncapabilities but also safeguards sensitive data concurrently.\n","authors":["Darpit Dave","Kathan Vyas","Jagadish Kumaran Jayagopal","Alfredo Garcia","Madhav Erraguntla","Mark Lawley"],"pdf_url":"https://arxiv.org/pdf/2408.13926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05282v3","updated":"2024-08-25T19:02:37Z","published":"2024-05-07T12:34:18Z","title":"The Detection of KIC 1718360, A Rotating Variable with a Possible\n Companion, Using Machine Learning","summary":" This paper presents the detection of a periodic dimming event in the\nlightcurve of the G1.5IV-V type star KIC 1718360. This is based on\nvisible-light observations conducted by both the TESS and Kepler space\ntelescopes. Analysis of the data seems to point toward a high rotation rate in\nthe star, with a rotational period of 2.938 days. The high variability seen\nwithin the star's lightcurve points toward classification as a rotating\nvariable. The initial observation was made in Kepler Quarter 16 data using the\nOne-Class SVM machine learning method. Subsequent observations by the TESS\nspace telescope corroborated these findings. It appears that KIC 1718360 is a\nnearby rotating variable that appears in little to no major catalogs as such. A\nsecondary, additional periodic dip is also present, indicating a possible\nexoplanetary companion.\n","authors":["Jakob Roche"],"pdf_url":"https://arxiv.org/pdf/2405.05282v3.pdf","comment":"6 pages, 6 figures Revised to correct errors, update and add data"},{"id":"http://arxiv.org/abs/2408.13912v1","updated":"2024-08-25T18:27:20Z","published":"2024-08-25T18:27:20Z","title":"Splatt3R: Zero-shot Gaussian Splatting from Uncalibarated Image Pairs","summary":" In this paper, we introduce Splatt3R, a pose-free, feed-forward method for\nin-the-wild 3D reconstruction and novel view synthesis from stereo pairs. Given\nuncalibrated natural images, Splatt3R can predict 3D Gaussian Splats without\nrequiring any camera parameters or depth information. For generalizability, we\nstart from a 'foundation' 3D geometry reconstruction method, MASt3R, and extend\nit to be a full 3D structure and appearance reconstructor. Specifically, unlike\nthe original MASt3R which reconstructs only 3D point clouds, we predict the\nadditional Gaussian attributes required to construct a Gaussian primitive for\neach point. Hence, unlike other novel view synthesis methods, Splatt3R is first\ntrained by optimizing the 3D point cloud's geometry loss, and then a novel view\nsynthesis objective. 
By doing this, we avoid the local minima present in\ntraining 3D Gaussian Splats from stereo views. We also propose a novel loss\nmasking strategy that we empirically find is critical for strong performance on\nextrapolated viewpoints. We train Splatt3R on the ScanNet++ dataset and\ndemonstrate excellent generalisation to uncalibrated, in-the-wild images.\nSplatt3R can reconstruct scenes at 4FPS at 512 x 512 resolution, and the\nresultant splats can be rendered in real-time.\n","authors":["Brandon Smart","Chuanxia Zheng","Iro Laina","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2408.13912v1.pdf","comment":"Our project page can be found at: https://splatt3r.active.vision/"},{"id":"http://arxiv.org/abs/2308.06375v2","updated":"2024-08-25T18:04:21Z","published":"2023-08-11T20:17:22Z","title":"UAMM: Price-oracle based Automated Market Maker","summary":" Automated market makers (AMMs) are pricing mechanisms utilized by\ndecentralized exchanges (DEX). Traditional AMM approaches are constrained by\npricing solely based on their own liquidity pool, without consideration of\nexternal markets or risk management for liquidity providers. In this paper, we\npropose a new approach known as UBET AMM (UAMM), which calculates prices by\nconsidering external market prices and the impermanent loss of the liquidity\npool. Despite relying on external market prices, our method maintains the\ndesired properties of a constant product curve when computing slippages. The\nkey element of UAMM is determining the appropriate slippage amount based on the\ndesired target balance, which encourages the liquidity pool to minimize\nimpermanent loss. We demonstrate that our approach eliminates arbitrage\nopportunities when external market prices are efficient.\n","authors":["Daniel Jiwoong Im","Alexander Kondratskiy","Vincent Harvey","Hsuan-Wei Fu"],"pdf_url":"https://arxiv.org/pdf/2308.06375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13906v1","updated":"2024-08-25T18:02:36Z","published":"2024-08-25T18:02:36Z","title":"ConVis: Contrastive Decoding with Hallucination Visualization for\n Mitigating Hallucinations in Multimodal Large Language Models","summary":" Hallucinations in Multimodal Large Language Models (MLLMs) where generated\nresponses fail to accurately reflect the given image pose a significant\nchallenge to their reliability. To address this, we introduce ConVis, a novel\ntraining-free contrastive decoding method. ConVis leverages a text-to-image\n(T2I) generation model to semantically reconstruct the given image from\nhallucinated captions. By comparing the contrasting probability distributions\nproduced by the original and reconstructed images, ConVis enables MLLMs to\ncapture visual contrastive signals that penalize hallucination generation.\nNotably, this method operates purely within the decoding process, eliminating\nthe need for additional data or model updates. Our extensive experiments on\nfive popular benchmarks demonstrate that ConVis effectively reduces\nhallucinations across various MLLMs, highlighting its potential to enhance\nmodel reliability.\n","authors":["Yeji Park","Deokyeong Lee","Junsuk Choe","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2408.13906v1.pdf","comment":"First two authors contributed equally. 
Source code is available at\n https://github.com/yejipark-m/ConVis"},{"id":"http://arxiv.org/abs/2407.10159v2","updated":"2024-08-25T17:59:22Z","published":"2024-07-14T10:59:34Z","title":"RAPiD-Seg: Range-Aware Pointwise Distance Distribution Networks for 3D\n LiDAR Segmentation","summary":" 3D point clouds play a pivotal role in outdoor scene perception, especially\nin the context of autonomous driving. Recent advancements in 3D LiDAR\nsegmentation often focus intensely on the spatial positioning and distribution\nof points for accurate segmentation. However, these methods, while robust in\nvariable conditions, encounter challenges due to sole reliance on coordinates\nand point intensity, leading to poor isometric invariance and suboptimal\nsegmentation. To tackle this challenge, our work introduces Range-Aware\nPointwise Distance Distribution (RAPiD) features and the associated RAPiD-Seg\narchitecture. Our RAPiD features exhibit rigid transformation invariance and\neffectively adapt to variations in point density, with a design focus on\ncapturing the localized geometry of neighboring structures. They utilize\ninherent LiDAR isotropic radiation and semantic categorization for enhanced\nlocal representation and computational efficiency, while incorporating a 4D\ndistance metric that integrates geometric and surface material reflectivity for\nimproved semantic segmentation. To effectively embed high-dimensional RAPiD\nfeatures, we propose a double-nested autoencoder structure with a novel\nclass-aware embedding objective to encode high-dimensional features into\nmanageable voxel-wise embeddings. Additionally, we propose RAPiD-Seg which\nincorporates a channel-wise attention fusion and two effective RAPiD-Seg\nvariants, further optimizing the embedding for enhanced performance and\ngeneralization. Our method outperforms contemporary LiDAR segmentation work in\nterms of mIoU on SemanticKITTI (76.1) and nuScenes (83.6) datasets.\n","authors":["Li Li","Hubert P. H. Shum","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2407.10159v2.pdf","comment":"ECCV 2024 (Oral); 18 pages, 6 figures, 7 tables; Code at\n https://github.com/l1997i/rapid_seg"},{"id":"http://arxiv.org/abs/2408.13902v1","updated":"2024-08-25T17:59:17Z","published":"2024-08-25T17:59:17Z","title":"TraIL-Det: Transformation-Invariant Local Feature Networks for 3D LiDAR\n Object Detection with Unsupervised Pre-Training","summary":" 3D point clouds are essential for perceiving outdoor scenes, especially\nwithin the realm of autonomous driving. Recent advances in 3D LiDAR Object\nDetection focus primarily on the spatial positioning and distribution of points\nto ensure accurate detection. However, despite their robust performance in\nvariable conditions, these methods are hindered by their sole reliance on\ncoordinates and point intensity, resulting in inadequate isometric invariance\nand suboptimal detection outcomes. To tackle this challenge, our work\nintroduces Transformation-Invariant Local (TraIL) features and the associated\nTraIL-Det architecture. Our TraIL features exhibit rigid transformation\ninvariance and effectively adapt to variations in point density, with a design\nfocus on capturing the localized geometry of neighboring structures. They\nutilize the inherent isotropic radiation of LiDAR to enhance local\nrepresentation, improve computational efficiency, and boost detection\nperformance. 
To effectively process the geometric relations among points within\neach proposal, we propose a Multi-head self-Attention Encoder (MAE) with\nasymmetric geometric features to encode high-dimensional TraIL features into\nmanageable representations. Our method outperforms contemporary self-supervised\n3D object detection approaches in terms of mAP on KITTI (67.8, 20% label,\nmoderate) and Waymo (68.9, 20% label, moderate) datasets under various label\nratios (20%, 50%, and 100%).\n","authors":["Li Li","Tanqiu Qiao","Hubert P. H. Shum","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2408.13902v1.pdf","comment":"BMVC 2024; 15 pages, 3 figures, 3 tables; Code at\n https://github.com/l1997i/rapid_seg"},{"id":"http://arxiv.org/abs/2405.14099v2","updated":"2024-08-25T17:35:33Z","published":"2024-05-23T02:01:05Z","title":"Automatic Differentiation is Essential in Training Neural Networks for\n Solving Differential Equations","summary":" Neural network-based approaches have recently shown significant promise in\nsolving partial differential equations (PDEs) in science and engineering,\nespecially in scenarios featuring complex domains or the incorporation of\nempirical data. One advantage of the neural network method for PDEs lies in its\nautomatic differentiation (AD), which necessitates only the sample points\nthemselves, unlike traditional finite difference (FD) approximations that\nrequire nearby local points to compute derivatives. In this paper, we\nquantitatively demonstrate the advantage of AD in training neural networks. The\nconcept of truncated entropy is introduced to characterize the training\nproperty. Specifically, through comprehensive experimental and theoretical\nanalyses conducted on random feature models and two-layer neural networks, we\ndiscover that the defined truncated entropy serves as a reliable metric for\nquantifying the residual loss of random feature models and the training speed\nof neural networks for both AD and FD methods. Our experimental and theoretical\nanalyses demonstrate that, from a training perspective, AD outperforms FD in\nsolving partial differential equations.\n","authors":["Chuqi Chen","Yahong Yang","Yang Xiang","Wenrui Hao"],"pdf_url":"https://arxiv.org/pdf/2405.14099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20891v2","updated":"2024-08-25T17:07:49Z","published":"2024-07-30T15:07:13Z","title":"Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian\n Neural Networks","summary":" Computational complexity of Bayesian learning is impeding its adoption in\npractical, large-scale tasks. Despite demonstrations of significant merits such\nas improved robustness and resilience to unseen or out-of-distribution inputs\nover their non- Bayesian counterparts, their practical use has faded to near\ninsignificance. In this study, we introduce an innovative framework to mitigate\nthe computational burden of Bayesian neural networks (BNNs). Our approach\nfollows the principle of Bayesian techniques based on deep ensembles, but\nsignificantly reduces their cost via multiple low-rank perturbations of\nparameters arising from a pre-trained neural network. Both vanilla version of\nensembles as well as more sophisticated schemes such as Bayesian learning with\nStein Variational Gradient Descent (SVGD), previously deemed impractical for\nlarge models, can be seamlessly implemented within the proposed framework,\ncalled Bayesian Low-Rank LeArning (Bella). 
In a nutshell, i) Bella achieves a\ndramatic reduction in the number of trainable parameters required to\napproximate a Bayesian posterior; and ii) it not only maintains, but in some\ninstances, surpasses the performance of conventional Bayesian learning methods\nand non-Bayesian baselines. Our results with large-scale tasks such as\nImageNet, CAMELYON17, DomainNet, VQA with CLIP, LLaVA demonstrate the\neffectiveness and versatility of Bella in building highly scalable and\npractical Bayesian deep models for real-world applications.\n","authors":["Bao Gia Doan","Afshar Shamsi","Xiao-Yu Guo","Arash Mohammadi","Hamid Alinejad-Rokny","Dino Sejdinovic","Damith C. Ranasinghe","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.20891v2.pdf","comment":"17 pages, 14 figures, 11 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.13786v1","updated":"2024-08-25T09:29:20Z","published":"2024-08-25T09:29:20Z","title":"Localization of Synthetic Manipulations in Western Blot Images","summary":" Recent breakthroughs in deep learning and generative systems have\nsignificantly fostered the creation of synthetic media, as well as the local\nalteration of real content via the insertion of highly realistic synthetic\nmanipulations. Local image manipulation, in particular, poses serious\nchallenges to the integrity of digital content and societal trust. This problem\nis not only confined to multimedia data, but also extends to biological images\nincluded in scientific publications, like images depicting Western blots. In\nthis work, we address the task of localizing synthetic manipulations in Western\nblot images. To discriminate between pristine and synthetic pixels of an\nanalyzed image, we propose a synthetic detector that operates on small patches\nextracted from the image. We aggregate patch contributions to estimate a\ntampering heatmap, highlighting synthetic pixels out of pristine ones. Our\nmethodology proves effective when tested over two manipulated Western blot\nimage datasets, one altered automatically and the other manually by exploiting\nadvanced AI-based image manipulation tools that are unknown at our training\nstage. We also explore the robustness of our method over an external dataset of\nother scientific images depicting different semantics, manipulated through\nunseen generation techniques.\n","authors":["Anmol Manjunath","Viola Negroni","Sara Mandelli","Daniel Moreira","Paolo Bestagini"],"pdf_url":"https://arxiv.org/pdf/2408.13786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13784v1","updated":"2024-08-25T09:28:04Z","published":"2024-08-25T09:28:04Z","title":"Analyzing the Impact of Splicing Artifacts in Partially Fake Speech\n Signals","summary":" Speech deepfake detection has recently gained significant attention within\nthe multimedia forensics community. Related issues have also been explored,\nsuch as the identification of partially fake signals, i.e., tracks that include\nboth real and fake speech segments. However, generating high-quality spliced\naudio is not as straightforward as it may appear. Spliced signals are typically\ncreated through basic signal concatenation. This process could introduce\nnoticeable artifacts that can make the generated data easier to detect. We\nanalyze spliced audio tracks resulting from signal concatenation, investigate\ntheir artifacts and assess whether such artifacts introduce any bias in\nexisting datasets. 
Our findings reveal that by analyzing splicing artifacts, we\ncan achieve a detection EER of 6.16% and 7.36% on PartialSpoof and HAD\ndatasets, respectively, without needing to train any detector. These results\nunderscore the complexities of generating reliable spliced audio data and lead\nto discussions that can help improve future research in this area.\n","authors":["Viola Negroni","Davide Salvi","Paolo Bestagini","Stefano Tubaro"],"pdf_url":"https://arxiv.org/pdf/2408.13784v1.pdf","comment":"Accepted at ASVspoof 5 Workshop (Interspeech2024 Satellite)"},{"id":"http://arxiv.org/abs/2404.13621v4","updated":"2024-08-25T06:13:24Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. The robustness of\nthese techniques, however, remains a concern, particularly in the face of\nadversarial attacks that have been proven to deceive state-of-the-art deep\nneural networks in many domains. Surprisingly, the robustness of scene flow\nnetworks against such attacks has not been thoroughly investigated. To address\nthis problem, the proposed approach aims to bridge this gap by introducing\nadversarial white-box attacks specifically tailored for scene flow networks.\nExperimental results show that the generated adversarial examples obtain up to\n33.7 relative degradation in average end-point error on the KITTI and\nFlyingThings3D datasets. The study also reveals the significant impact that\nattacks targeting point clouds in only one dimension or color channel have on\naverage end-point error. Analyzing the success and failure of these attacks on\nthe scene flow networks and their 2D optical flow network variants shows a\nhigher vulnerability for the optical flow networks. Code is available at\nhttps://github.com/aheldis/Attack-on-Scene-Flow-using-Point-Clouds.git.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13712v1","updated":"2024-08-25T03:21:48Z","published":"2024-08-25T03:21:48Z","title":"Riemann-based Multi-scale Attention Reasoning Network for Text-3D\n Retrieval","summary":" Due to the challenges in acquiring paired Text-3D data and the inherent\nirregularity of 3D data structures, combined representation learning of 3D\npoint clouds and text remains unexplored. In this paper, we propose a novel\nRiemann-based Multi-scale Attention Reasoning Network (RMARN) for text-3D\nretrieval. Specifically, the extracted text and point cloud features are\nrefined by their respective Adaptive Feature Refiner (AFR). Furthermore, we\nintroduce the innovative Riemann Local Similarity (RLS) module and the Global\nPooling Similarity (GPS) module. However, as 3D point cloud data and text data\noften possess complex geometric structures in high-dimensional space, the\nproposed RLS employs a novel Riemann Attention Mechanism to reflect the\nintrinsic geometric relationships of the data. Without explicitly defining the\nmanifold, RMARN learns the manifold parameters to better represent the\ndistances between text-point cloud samples. To address the challenges of\nlacking paired text-3D data, we have created the large-scale Text-3D Retrieval\ndataset T3DR-HIT, which comprises over 3,380 pairs of text and point cloud\ndata. 
T3DR-HIT contains coarse-grained indoor 3D scenes and fine-grained\nChinese artifact scenes, consisting of 1,380 and over 2,000 text-3D pairs,\nrespectively. Experiments on our custom datasets demonstrate the superior\nperformance of the proposed method. Our code and proposed datasets are\navailable at \\url{https://github.com/liwrui/RMARN}.\n","authors":["Wenrui Li","Wei Han","Yandu Chen","Yeyu Chai","Yidan Lu","Xingtao Wang","Xiaopeng Fan"],"pdf_url":"https://arxiv.org/pdf/2408.13712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13711v1","updated":"2024-08-25T02:56:26Z","published":"2024-08-25T02:56:26Z","title":"SceneDreamer360: Text-Driven 3D-Consistent Scene Generation with\n Panoramic Gaussian Splatting","summary":" Text-driven 3D scene generation has seen significant advancements recently.\nHowever, most existing methods generate single-view images using generative\nmodels and then stitch them together in 3D space. This independent generation\nfor each view often results in spatial inconsistency and implausibility in the\n3D scenes. To address this challenge, we proposed a novel text-driven\n3D-consistent scene generation model: SceneDreamer360. Our proposed method\nleverages a text-driven panoramic image generation model as a prior for 3D\nscene generation and employs 3D Gaussian Splatting (3DGS) to ensure consistency\nacross multi-view panoramic images. Specifically, SceneDreamer360 enhances the\nfine-tuned Panfusion generator with a three-stage panoramic enhancement,\nenabling the generation of high-resolution, detail-rich panoramic images.\nDuring the 3D scene construction, a novel point cloud fusion initialization\nmethod is used, producing higher quality and spatially consistent point clouds.\nOur extensive experiments demonstrate that compared to other methods,\nSceneDreamer360 with its panoramic image generation and 3DGS can produce higher\nquality, spatially consistent, and visually appealing 3D scenes from any text\nprompt. Our codes are available at\n\\url{https://github.com/liwrui/SceneDreamer360}.\n","authors":["Wenrui Li","Yapeng Mi","Fucheng Cai","Zhe Yang","Wangmeng Zuo","Xingtao Wang","Xiaopeng Fan"],"pdf_url":"https://arxiv.org/pdf/2408.13711v1.pdf","comment":null}]},"2024-08-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.13678v1","updated":"2024-08-24T22:03:40Z","published":"2024-08-24T22:03:40Z","title":"A layer-wise analysis of Mandarin and English suprasegmentals in SSL\n speech models","summary":" This study asks how self-supervised speech models represent suprasegmental\ncategories like Mandarin lexical tone, English lexical stress, and English\nphrasal accents. Through a series of probing tasks, we make layer-wise\ncomparisons of English and Mandarin 12 layer monolingual models. Our findings\nsuggest that 1) English and Mandarin wav2vec 2.0 models learn contextual\nrepresentations of abstract suprasegmental categories which are strongest in\nthe middle third of the network. 2) Models are better at representing features\nthat exist in the language of their training data, and this difference is\ndriven by enriched context in transformer blocks, not local acoustic\nrepresentation. 3) Fine-tuned wav2vec 2.0 improves performance in later layers\ncompared to pre-trained models mainly for lexically contrastive features like\ntone and stress, 4) HuBERT and WavLM learn similar representations to wav2vec\n2.0, differing mainly in later layer performance. 
Our results extend previous\nunderstanding of how models represent suprasegmentals and offer new insights\ninto the language-specificity and contextual nature of these representations.\n","authors":["Antón de la Fuente","Dan Jurafsky"],"pdf_url":"https://arxiv.org/pdf/2408.13678v1.pdf","comment":"4 pages, 3 figures, to be published in Interspeech 2024 proceedings"},{"id":"http://arxiv.org/abs/2310.06830v2","updated":"2024-08-24T21:30:00Z","published":"2023-10-10T17:57:45Z","title":"Lemur: Harmonizing Natural Language and Code for Language Agents","summary":" We introduce Lemur and Lemur-Chat, openly accessible language models\noptimized for both natural language and coding capabilities to serve as the\nbackbone of versatile language agents. The evolution from language chat models\nto functional language agents demands that models not only master human\ninteraction, reasoning, and planning but also ensure grounding in the relevant\nenvironments. This calls for a harmonious blend of language and coding\ncapabilities in the models. Lemur and Lemur-Chat are proposed to address this\nnecessity, demonstrating balanced proficiencies in both domains, unlike\nexisting open-source models that tend to specialize in either. Through\nmeticulous pre-training using a code-intensive corpus and instruction\nfine-tuning on text and code data, our models achieve state-of-the-art averaged\nperformance across diverse text and coding benchmarks among open-source models.\nComprehensive experiments demonstrate Lemur's superiority over existing\nopen-source models and its proficiency across various agent tasks involving\nhuman communication, tool usage, and interaction under fully- and partially-\nobservable environments. The harmonization between natural and programming\nlanguages enables Lemur-Chat to significantly narrow the gap with proprietary\nmodels on agent abilities, providing key insights into developing advanced\nopen-source agents adept at reasoning, planning, and operating seamlessly\nacross environments. https://github.com/OpenLemur/Lemur\n","authors":["Yiheng Xu","Hongjin Su","Chen Xing","Boyu Mi","Qian Liu","Weijia Shi","Binyuan Hui","Fan Zhou","Yitao Liu","Tianbao Xie","Zhoujun Cheng","Siheng Zhao","Lingpeng Kong","Bailin Wang","Caiming Xiong","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2310.06830v2.pdf","comment":"ICLR 2024 Spotlight; https://github.com/OpenLemur/Lemur"},{"id":"http://arxiv.org/abs/2401.13463v3","updated":"2024-08-24T20:28:38Z","published":"2024-01-24T14:08:38Z","title":"SpeechDPR: End-to-End Spoken Passage Retrieval for Open-Domain Spoken\n Question Answering","summary":" Spoken Question Answering (SQA) is essential for machines to reply to user's\nquestion by finding the answer span within a given spoken passage. SQA has been\npreviously achieved without ASR to avoid recognition errors and\nOut-of-Vocabulary (OOV) problems. However, the real-world problem of\nOpen-domain SQA (openSQA), in which the machine needs to first retrieve\npassages that possibly contain the answer from a spoken archive in addition,\nwas never considered. This paper proposes the first known end-to-end framework,\nSpeech Dense Passage Retriever (SpeechDPR), for the retrieval component of the\nopenSQA problem. SpeechDPR learns a sentence-level semantic representation by\ndistilling knowledge from the cascading model of unsupervised ASR (UASR) and\ntext dense retriever (TDR). 
No manually transcribed speech data is needed.\nInitial experiments showed performance comparable to the cascading model of\nUASR and TDR, and significantly better when UASR was poor, verifying this\napproach is more robust to speech recognition errors.\n","authors":["Chyi-Jiunn Lin","Guan-Ting Lin","Yung-Sung Chuang","Wei-Lun Wu","Shang-Wen Li","Abdelrahman Mohamed","Hung-yi Lee","Lin-shan Lee"],"pdf_url":"https://arxiv.org/pdf/2401.13463v3.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2408.09172v3","updated":"2024-08-24T20:26:43Z","published":"2024-08-17T11:33:23Z","title":"Unc-TTP: A Method for Classifying LLM Uncertainty to Improve In-Context\n Example Selection","summary":" Nowadays, Large Language Models (LLMs) have demonstrated exceptional\nperformance across various downstream tasks. However, it is challenging for\nusers to discern whether the responses are generated with certainty or are\nfabricated to meet user expectations. Estimating the uncertainty of LLMs is\nparticularly challenging due to their vast scale and the lack of white-box\naccess. In this work, we propose a novel Uncertainty Tripartite Testing\nParadigm (Unc-TTP) to classify LLM uncertainty, via evaluating the consistency\nof LLM outputs when incorporating label interference into the sampling-based\napproach. Based on Unc-TTP outputs, we aggregate instances into certain and\nuncertain categories. Further, we conduct a detailed analysis of the\nuncertainty properties of LLMs and show Unc-TTP's superiority over the existing\nsampling-based methods. In addition, we leverage the obtained uncertainty\ninformation to guide in-context example selection, demonstrating that Unc-TTP\nobviously outperforms retrieval-based and sampling-based approaches in\nselecting more informative examples. Our work paves a new way to classify the\nuncertainty of both open- and closed-source LLMs, and introduces a practical\napproach to exploit this uncertainty to improve LLMs performance.\n","authors":["Hsiu-Yuan Huang","Zichen Wu","Yutong Yang","Junzhao Zhang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.09172v3.pdf","comment":"The model diagram in Figure 1 on page 3 of the paper has significant\n ambiguities. It may lead readers to mistakenly believe that the experiments\n were conducted in a multi-turn dialogue format. Therefore, we request the\n withdrawal of this submission"},{"id":"http://arxiv.org/abs/2408.13654v1","updated":"2024-08-24T19:11:54Z","published":"2024-08-24T19:11:54Z","title":"Symbolic Working Memory Enhances Language Models for Complex Rule\n Application","summary":" Large Language Models (LLMs) have shown remarkable reasoning performance but\nstruggle with multi-step deductive reasoning involving a series of rule\napplication steps, especially when rules are presented non-sequentially. Our\npreliminary analysis shows that while LLMs excel in single-step rule\napplication, their performance drops significantly in multi-step scenarios due\nto the challenge in rule grounding. It requires anchoring the applicable rule\nand supporting facts at each step, amidst multiple input rules, facts, and\ninferred facts. To address this, we propose augmenting LLMs with external\nworking memory and introduce a neurosymbolic framework for rule application.\nThe memory stores facts and rules in both natural language and symbolic forms,\nenabling precise tracking. Utilizing this memory, our framework iteratively\nperforms symbolic rule grounding and LLM-based rule implementation. 
The former\nmatches predicates and variables of symbolic rules and facts to ground\napplicable rules at each step. Experiments indicate our framework's\neffectiveness in rule application and its robustness across various steps and\nsettings~\\footnote{Code and data are available at\n\\url{https://github.com/SiyuanWangw/RuleApplication}.}.\n","authors":["Siyuan Wang","Zhongyu Wei","Yejin Choi","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2408.13654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13651v1","updated":"2024-08-24T18:51:47Z","published":"2024-08-24T18:51:47Z","title":"Narratives at Conflict: Computational Analysis of News Framing in\n Multilingual Disinformation Campaigns","summary":" Any report frames issues to favor a particular interpretation by highlighting\nor excluding certain aspects of a story. Despite the widespread use of framing\nin disinformation, framing properties and detection methods remain\nunderexplored outside the English-speaking world. We explore how multilingual\nframing of the same issue differs systematically. We use eight years of\nRussia-backed disinformation campaigns, spanning 8k news articles in 4\nlanguages targeting 15 countries. We find that disinformation campaigns\nconsistently and intentionally favor specific framing, depending on the target\nlanguage of the audience. We further discover how Russian-language articles\nconsistently highlight selected frames depending on the region of the media\ncoverage. We find that the two most prominent models for automatic frame\nanalysis underperform and show high disagreement, highlighting the need for\nfurther research.\n","authors":["Antonina Sinelnik","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2408.13651v1.pdf","comment":"Published in ACL SRW 2024 Proceedings, see\n https://aclanthology.org/2024.acl-srw.21/"},{"id":"http://arxiv.org/abs/2408.13631v1","updated":"2024-08-24T17:17:46Z","published":"2024-08-24T17:17:46Z","title":"Ancient but Digitized: Developing Handwritten Optical Character\n Recognition for East Syriac Script Through Creating KHAMIS Dataset","summary":" Many languages have vast amounts of handwritten texts, such as ancient\nscripts about folktale stories and historical narratives or contemporary\ndocuments and letters. Digitization of those texts has various applications,\nsuch as daily tasks, cultural studies, and historical research. Syriac is an\nancient, endangered, and low-resourced language that has not received the\nattention it requires and deserves. This paper reports on a research project\naimed at developing a optical character recognition (OCR) model based on the\nhandwritten Syriac texts as a starting point to build more digital services for\nthis endangered language. A dataset was created, KHAMIS (inspired by the East\nSyriac poet, Khamis bar Qardahe), which consists of handwritten sentences in\nthe East Syriac script. We used it to fine-tune the Tesseract-OCR engine's\npretrained Syriac model on handwritten data. The data was collected from\nvolunteers capable of reading and writing in the language to create KHAMIS.\nKHAMIS currently consists of 624 handwritten Syriac sentences collected from 31\nuniversity students and one professor, and it will be partially available\nonline and the whole dataset available in the near future for development and\nresearch purposes. 
As a result, the handwritten OCR model was able to achieve a\ncharacter error rate of 1.097-1.610% and 8.963-10.490% on both training and\nevaluation sets, respectively, and both a character error rate of 18.89-19.71%\nand a word error rate of 62.83-65.42% when evaluated on the test set, which is\ntwice as better than the default Syriac model of Tesseract.\n","authors":["Ameer Majeed","Hossein Hassani"],"pdf_url":"https://arxiv.org/pdf/2408.13631v1.pdf","comment":"15 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.09817v2","updated":"2024-08-24T17:01:19Z","published":"2024-07-13T09:28:24Z","title":"Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech\n Recognition System","summary":" Multi-talker speech recognition and target-talker speech recognition, both\ninvolve transcription in multi-talker contexts, remain significant challenges.\nHowever, existing methods rarely attempt to simultaneously address both tasks.\nIn this study, we propose a pioneering approach to empower Whisper, which is a\nspeech foundation model, to tackle joint multi-talker and target-talker speech\nrecognition tasks. Specifically, (i) we freeze Whisper and plug a Sidecar\nseparator into its encoder to separate mixed embedding for multiple talkers;\n(ii) a Target Talker Identifier is introduced to identify the embedding flow of\nthe target talker on the fly, requiring only three-second enrollment speech as\na cue; (iii) soft prompt tuning for decoder is explored for better task\nadaptation. Our method outperforms previous methods on two- and three-talker\nLibriMix and LibriSpeechMix datasets for both tasks, and delivers acceptable\nzero-shot performance on multi-talker ASR on AishellMix Mandarin dataset.\n","authors":["Lingwei Meng","Jiawen Kang","Yuejiao Wang","Zengrui Jin","Xixin Wu","Xunying Liu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2407.09817v2.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2408.13624v1","updated":"2024-08-24T16:35:00Z","published":"2024-08-24T16:35:00Z","title":"No Dataset Needed for Downstream Knowledge Benchmarking: Response\n Dispersion Inversely Correlates with Accuracy on Domain-specific QA","summary":" This research seeks to obviate the need for creating QA datasets and grading\n(chatbot) LLM responses when comparing LLMs' knowledge in specific topic\ndomains. This is done in an entirely end-user centric way without need for\naccess to any inner workings of the LLM, so long as it can be prompted and\ngiven a random seed to create different generations to the same prompt. The\npaper does this by, for a given topic domain, defining the \"response\ndispersion\" of an LLM by repeatedly asking an LLM the same opinion question\nabout that topic domain. Namely, the response dispersion is the count of\nsingular values needed to explain 95% of the variance in the embedding matrix\nof the LLM's responses. It is found that the response dispersion is inversely\ncorrelated with accuracy on relevant QA evaluations (average spearman rank\ncorrelation stronger than -.59). A use-case analysis shows that when comparing\ntwo different LLMs on the same topic domain, comparing their response\ndispersion is a suitable replacement for comparing their QA accuracy between\n74% and 89% of the time, the range depending on certain reasonable\naccuracy-difference tolerances that may be acceptable to an end-user in\nexchange for the labor being saved using response dispersion instead of QA\naccuracy for comparison. 
Two response embeddings are studied for creating the\nembedding matrix in this study, one is from OpenAI's APIs and one is a novel\nembedding, here named reference sentence similarity embeddings, that can be\ncomputed locally and performs very nearly as well in calculating response\ndispersion. Also in this research, a pre-existing dataset called the IRC-Wiki\nTrivia dataset, originally developed for trivia games, has been re-purposed,\ncurated, and the curation, called IRC-WikiTriviaQA, is made available for the\npurpose of this research.\n","authors":["Robert L Simione II"],"pdf_url":"https://arxiv.org/pdf/2408.13624v1.pdf","comment":"16 pages, 3 tables, 1 figure"},{"id":"http://arxiv.org/abs/2402.02563v4","updated":"2024-08-24T14:46:55Z","published":"2024-02-04T16:45:01Z","title":"Synergy-of-Thoughts: Eliciting Efficient Reasoning in Hybrid Language\n Models","summary":" Large language models (LLMs) have shown impressive emergent abilities in a\nwide range of tasks, but the associated expensive API cost greatly limits the\nreal application. Previous works like chain-of-thought (CoT) and\ntree-of-thoughts (ToT) have predominately focused on enhancing accuracy, but\noverlook the rapidly increasing API cost, which could be particularly\nproblematic for open-ended real-world tasks with huge solution spaces.\nMotivated by the dual process theory of human cognition, we propose \"Synergy of\nThoughts\"(SoT) to unleash the synergistic potential of hybrid LLMs with\ndifferent scales for efficient reasoning. By default, SoT uses smaller-scale\nlanguage models to generate multiple low-cost intuitive thoughts, which\nresembles the parallel intuitions produced by System 1. We then design a\nconfidence evaluator where the intuitive thoughts are cross-evaluated and\nintroduce a controllable threshold mechanism to decide their mutual conflict.\nIf these intuitive thoughts exhibit conflicts, SoT will invoke the reflective\nreasoning of scaled-up language models to emulate the intervention of System 2,\nwhich will override the intuitive thoughts and rectify the reasoning results.\nThis framework is model-agnostic and training-free, which can be flexibly\nimplemented with various off-the-shelf LLMs. Experiments on six representative\nreasoning tasks show that SoT substantially reduces the API cost by\n38.3%-75.1%, and simultaneously achieves state-of-the-art reasoning accuracy\nand solution diversity. Notably, the average token cost reduction on open-ended\ntasks reaches up to 69.1%.\n","authors":["Yu Shang","Yu Li","Fengli Xu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2402.02563v4.pdf","comment":"19 pages, 16 figures, 12 tables"},{"id":"http://arxiv.org/abs/2407.12725v2","updated":"2024-08-24T14:44:11Z","published":"2024-07-17T16:42:03Z","title":"Is Sarcasm Detection A Step-by-Step Reasoning Process in Large Language\n Models?","summary":" Elaborating a series of intermediate reasoning steps significantly improves\nthe ability of large language models (LLMs) to solve complex problems, as such\nsteps would evoke LLMs to think sequentially. However, human sarcasm\nunderstanding is often considered an intuitive and holistic cognitive process,\nin which various linguistic, contextual, and emotional cues are integrated to\nform a comprehensive understanding, in a way that does not necessarily follow a\nstep-by-step fashion. 
To verify the validity of this argument, we introduce a\nnew prompting framework (called SarcasmCue) containing four sub-methods, viz.\nchain of contradiction (CoC), graph of cues (GoC), bagging of cues (BoC) and\ntensor of cues (ToC), which elicits LLMs to detect human sarcasm by considering\nsequential and non-sequential prompting methods. Through a comprehensive\nempirical comparison on four benchmarks, we highlight three key findings: (1)\nCoC and GoC show superior performance with more advanced models like GPT-4 and\nClaude 3.5, with an improvement of 3.5%. (2) ToC significantly outperforms\nother methods when smaller LLMs are evaluated, boosting the F1 score by 29.7%\nover the best baseline. (3) Our proposed framework consistently pushes the\nstate-of-the-art (i.e., ToT) by 4.2%, 2.0%, 29.7%, and 58.2% in F1 scores\nacross four datasets. This demonstrates the effectiveness and stability of the\nproposed framework.\n","authors":["Ben Yao","Yazhou Zhang","Qiuchi Li","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2407.12725v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.13586v1","updated":"2024-08-24T14:14:32Z","published":"2024-08-24T14:14:32Z","title":"Balancing Diversity and Risk in LLM Sampling: How to Select Your Method\n and Parameter for Open-Ended Text Generation","summary":" Sampling-based decoding strategies have been widely adopted for Large\nLanguage Models (LLMs) in numerous applications, which target a balance between\ndiversity and quality via temperature tuning and tail truncation (e.g., top-k\nand top-p sampling). Considering the high dynamic range of the candidate\nnext-token given different prefixes, recent studies propose to adaptively\ntruncate the tail of LLM's predicted distribution. Although improved results\nhaven been reported with these methods on open-ended text generation tasks, the\nresults are highly dependent on the curated truncation parameters and exemplar\ntext. In this paper, we propose a systematic way to estimate the intrinsic\ncapacity of a truncation sampling method by considering the trade-off between\ndiversity and risk at each decoding step, based on our collected prefix tree\nwhich preserves the context of a full sentence. Our work provides a\ncomprehensive comparison between existing truncation sampling methods, as well\nas their recommended parameters as a guideline for users.\n","authors":["Yuxuan Zhou","Margret Keuper","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2408.13586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13585v1","updated":"2024-08-24T13:59:41Z","published":"2024-08-24T13:59:41Z","title":"FLEURS-ASL: Including American Sign Language in Massively Multilingual\n Multitask Evaluation","summary":" Sign language translation has historically been peripheral to mainstream\nmachine translation research. In order to help converge the fields, we\nintroduce FLEURS-ASL, an extension of the multiway parallel benchmarks FLORES\n(for text) and FLEURS (for speech) to support their first sign language (as\nvideo), American Sign Language, translated by 5 Certified Deaf Interpreters.\nFLEURS-ASL can be used to evaluate a variety of tasks -- primarily sentence-\nand discourse-level translation -- between ASL and 200 other languages as text,\nor 102 languages as speech. We provide baselines for tasks from ASL to English\ntext using a unified modeling approach that incorporates timestamp tokens and\nprevious text tokens in a 34-second context window, trained on random video\nclips from YouTube-ASL. 
This model meets or exceeds the performance of\nphrase-level baselines while supporting a multitude of new tasks. We also use\nFLEURS-ASL to show that multimodal frontier models have virtually no\nunderstanding of ASL, underscoring the importance of including sign languages\nin standard evaluation suites.\n","authors":["Garrett Tanzer"],"pdf_url":"https://arxiv.org/pdf/2408.13585v1.pdf","comment":"Access FLEURS-ASL at\n https://www.kaggle.com/datasets/googleai/fleurs-asl. arXiv admin note: text\n overlap with arXiv:2408.07065"},{"id":"http://arxiv.org/abs/2408.08688v2","updated":"2024-08-24T12:34:01Z","published":"2024-08-16T12:01:55Z","title":"The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic\n Preference Optimization Dataset Generation","summary":" This paper presents synthetic Preference Optimization (PO) datasets generated\nusing multi-agent workflows and evaluates the effectiveness and potential of\nthese workflows in the dataset generation process. PO dataset generation\nrequires two modules: (1) response evaluation, and (2) response generation. In\nthe response evaluation module, the responses from Large Language Models (LLMs)\nare evaluated and ranked - a task typically carried out by human annotators\nthat we automate using LLMs. We assess the response evaluation module in a 2\nstep process. In step 1, we assess LLMs as evaluators using three distinct\nprompting strategies. In step 2, we apply the winning prompting strategy to\ncompare the performance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In\neach step, we use inter-rater agreement using Cohen's Kappa between human\nannotators and LLMs. For the response generation module, we compare different\nconfigurations for the LLM Feedback Loop using the identified LLM evaluator\nconfiguration. We use the win rate (the fraction of times a generation\nframework is selected as the best by an LLM evaluator) to determine the best\nmulti-agent configuration for generation. After identifying the best\nconfigurations for both modules, we use models from the GPT, Gemma, and Llama\nfamilies to generate our PO datasets using the above pipeline. We generate two\ntypes of PO datasets, one to improve the generation capabilities of individual\nLLM and the other to improve the multi-agent workflow. Our evaluation shows\nthat GPT-4o-as-a-Judge is more consistent across datasets when the candidate\nresponses do not include responses from the GPT family. Additionally, we find\nthat the LLM Feedback Loop, with Llama as the generator and Gemma as the\nreviewer, achieves a notable 71.8% and 73.8% win rate over single-agent Llama\nand Gemma, respectively.\n","authors":["Samee Arif","Sualeha Farid","Abdul Hameed Azeemi","Awais Athar","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08688v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15806v4","updated":"2024-08-24T12:01:30Z","published":"2023-09-27T17:29:41Z","title":"Lyra: Orchestrating Dual Correction in Automated Theorem Proving","summary":" Large Language Models (LLMs) present an intriguing avenue for exploration in\nthe field of formal theorem proving. Nevertheless, their full potential,\nparticularly concerning the mitigation of hallucinations and refinement through\nprover error messages, remains an area that has yet to be thoroughly\ninvestigated. To enhance the effectiveness of LLMs in the field, we introduce\nthe Lyra, a new framework that employs two distinct correction mechanisms: Tool\nCorrection (TC) and Conjecture Correction (CC). 
To implement Tool Correction in\nthe post-processing of formal proofs, we leverage prior knowledge to utilize\npredefined prover tools (e.g., Sledgehammer) for guiding the replacement of\nincorrect tools. Tool Correction significantly contributes to mitigating\nhallucinations, thereby improving the overall accuracy of the proof. In\naddition, we introduce Conjecture Correction, an error feedback mechanism\ndesigned to interact with prover to refine formal proof conjectures with prover\nerror messages. Compared to the previous refinement framework, the proposed\nConjecture Correction refines generation with instruction but does not collect\npaired (generation, error & refinement) prompts. Our method has achieved\nstate-of-the-art (SOTA) performance on both miniF2F validation (48.0% -> 55.3%)\nand test (45.5% -> 51.2%). We also present 3 IMO problems solved by Lyra. We\nbelieve Tool Correction (post-process for hallucination mitigation) and\nConjecture Correction (subgoal adjustment from interaction with environment)\ncould provide a promising avenue for future research in this field.\n","authors":["Chuanyang Zheng","Haiming Wang","Enze Xie","Zhengying Liu","Jiankai Sun","Huajian Xin","Jianhao Shen","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2309.15806v4.pdf","comment":"Accepted to TMLR: https://openreview.net/forum?id=9Z0yB8rmQ2"},{"id":"http://arxiv.org/abs/2408.13545v1","updated":"2024-08-24T10:34:20Z","published":"2024-08-24T10:34:20Z","title":"IQA-EVAL: Automatic Evaluation of Human-Model Interactive Question\n Answering","summary":" To evaluate Large Language Models (LLMs) for question answering (QA),\ntraditional methods typically focus on directly assessing the immediate\nresponses generated by the models based on the given question and context. In\nthe common use case of humans seeking AI assistant's help in finding\ninformation, these non-interactive evaluations do not account for the dynamic\nnature of human-model conversations, and interaction-aware evaluations have\nshown that accurate QA models are preferred by humans (Lee et al., 2023).\nRecent works in human-computer interaction (HCI) have employed human evaluators\nto conduct interactions and evaluations, but they are often prohibitively\nexpensive and time-consuming to scale. In this work, we introduce an automatic\nevaluation framework IQA-EVAL to Interactive Question Answering Evaluation.\nMore specifically, we introduce LLM-based Evaluation Agent (LEA) that can: (1)\nsimulate human behaviors to generate interactions with IQA models; (2)\nautomatically evaluate the generated interactions. Moreover, we propose\nassigning personas to LEAs to better simulate groups of real human evaluators.\nWe show that: (1) our evaluation framework with GPT-4 (or Claude) as the\nbackbone model achieves a high correlation with human evaluations on the IQA\ntask; (2) assigning personas to LEA to better represent the crowd further\nsignificantly improves correlations. 
Finally, we use our automatic metric to\nevaluate five recent representative LLMs with over 1000 questions from complex\nand ambiguous question answering tasks, which comes with a substantial cost of\n$5k if evaluated by humans.\n","authors":["Ruosen Li","Barry Wang","Ruochen Li","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2408.13545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07536v3","updated":"2024-08-24T09:59:31Z","published":"2023-11-13T18:22:32Z","title":"A Comprehensive Evaluation of GPT-4V on Knowledge-Intensive Visual\n Question Answering","summary":" The emergence of multimodal large models (MLMs) has significantly advanced\nthe field of visual understanding, offering remarkable capabilities in the\nrealm of visual question answering (VQA). Yet, the true challenge lies in the\ndomain of knowledge-intensive VQA tasks, which necessitate not just recognition\nof visual elements, but also a deep comprehension of the visual information in\nconjunction with a vast repository of learned knowledge. To uncover such\ncapabilities of MLMs, particularly the newly introduced GPT-4V and Gemini, we\nprovide an in-depth evaluation from three perspectives: 1) Commonsense\nKnowledge, which assesses how well models can understand visual cues and\nconnect to general knowledge; 2) Fine-grained World Knowledge, which tests the\nmodel's skill in reasoning out specific knowledge from images, showcasing their\nproficiency across various specialized fields; 3) Comprehensive Knowledge with\nDecision-making Rationales, which examines model's capability to provide\nlogical explanations for its inference, facilitating a deeper analysis from the\ninterpretability perspective. Additionally, we utilize a visual\nknowledge-enhanced training strategy and multimodal retrieval-augmented\ngeneration approach to enhance MLMs, highlighting the future need for\nadvancements in this research direction. Extensive experiments indicate that:\na) GPT-4V demonstrates enhanced explanation generation when using composite\nimages as few-shots; b) GPT-4V and other MLMs produce severe hallucinations\nwhen dealing with world knowledge; c) Visual knowledge enhanced training and\nprompting technicals present potential to improve performance. Codes:\nhttps://github.com/HITsz-TMG/Cognitive-Visual-Language-Mapper\n","authors":["Yunxin Li","Longyue Wang","Baotian Hu","Xinyu Chen","Wanqi Zhong","Chenyang Lyu","Wei Wang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.07536v3.pdf","comment":"20 pages, 15 pages; technical paper"},{"id":"http://arxiv.org/abs/2408.13534v1","updated":"2024-08-24T09:25:18Z","published":"2024-08-24T09:25:18Z","title":"Cultural Adaptation of Menus: A Fine-Grained Approach","summary":" Machine Translation of Culture-Specific Items (CSIs) poses significant\nchallenges. Recent work on CSI translation has shown some success using Large\nLanguage Models (LLMs) to adapt to different languages and cultures; however, a\ndeeper analysis is needed to examine the benefits and pitfalls of each method.\nIn this paper, we introduce the ChineseMenuCSI dataset, the largest for\nChinese-English menu corpora, annotated with CSI vs Non-CSI labels and a\nfine-grained test set. 
We define three levels of CSI figurativeness for a more\nnuanced analysis and develop a novel methodology for automatic CSI\nidentification, which outperforms GPT-based prompts in most categories.\nImportantly, we are the first to integrate human translation theories into\nLLM-driven translation processes, significantly improving translation accuracy,\nwith COMET scores increasing by up to 7 points.\n","authors":["Zhonghe Zhang","Xiaoyu He","Vivek Iyer","Alexandra Birch"],"pdf_url":"https://arxiv.org/pdf/2408.13534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13533v1","updated":"2024-08-24T09:23:01Z","published":"2024-08-24T09:23:01Z","title":"Pandora's Box or Aladdin's Lamp: A Comprehensive Analysis Revealing the\n Role of RAG Noise in Large Language Models","summary":" Retrieval-Augmented Generation (RAG) has emerged as a crucial method for\naddressing hallucinations in large language models (LLMs). While recent\nresearch has extended RAG models to complex noisy scenarios, these explorations\noften confine themselves to limited noise types and presuppose that noise is\ninherently detrimental to LLMs, potentially deviating from real-world retrieval\nenvironments and restricting practical applicability. In this paper, we define\nseven distinct noise types from a linguistic perspective and establish a Noise\nRAG Benchmark (NoiserBench), a comprehensive evaluation framework encompassing\nmultiple datasets and reasoning tasks. Through empirical evaluation of eight\nrepresentative LLMs with diverse architectures and scales, we reveal that these\nnoises can be further categorized into two practical groups: noise that is\nbeneficial to LLMs (aka beneficial noise) and noise that is harmful to LLMs\n(aka harmful noise). While harmful noise generally impairs performance,\nbeneficial noise may enhance several aspects of model capabilities and overall\nperformance. Our analysis offers insights for developing more robust, adaptable\nRAG solutions and mitigating hallucinations across diverse retrieval scenarios.\n","authors":["Jinyang Wu","Feihu Che","Chuyuan Zhang","Jianhua Tao","Shuai Zhang","Pengpeng Shao"],"pdf_url":"https://arxiv.org/pdf/2408.13533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12881v2","updated":"2024-08-24T09:11:13Z","published":"2023-12-20T09:45:44Z","title":"Big Tech influence over AI research revisited: memetic analysis of\n attribution of ideas to affiliation","summary":" There exists a growing discourse around the domination of Big Tech on the\nlandscape of artificial intelligence (AI) research, yet our comprehension of\nthis phenomenon remains cursory. This paper aims to broaden and deepen our\nunderstanding of Big Tech's reach and power within AI research. It highlights\nthe dominance not merely in terms of sheer publication volume but rather in the\npropagation of new ideas or memes. Current studies often oversimplify the\nconcept of influence to the share of affiliations in academic papers, typically\nsourced from limited databases such as arXiv or specific academic conferences.\n The main goal of this paper is to unravel the specific nuances of such\ninfluence, determining which AI ideas are predominantly driven by Big Tech\nentities. By employing network and memetic analysis on AI-oriented paper\nabstracts and their citation network, we are able to grasp a deeper insight\ninto this phenomenon. 
By utilizing two databases: OpenAlex and S2ORC, we are\nable to perform such analysis on a much bigger scale than previous attempts.\n Our findings suggest that while Big Tech-affiliated papers are\ndisproportionately more cited in some areas, the most cited papers are those\naffiliated with both Big Tech and Academia. Focusing on the most contagious\nmemes, their attribution to specific affiliation groups (Big Tech, Academia,\nmixed affiliation) seems equally distributed between those three groups. This\nsuggests that the notion of Big Tech domination over AI research is\noversimplified in the discourse.\n","authors":["Stanisław Giziński","Paulina Kaczyńska","Hubert Ruczyński","Emilia Wiśnios","Bartosz Pieliński","Przemysław Biecek","Julian Sienkiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.12881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13521v1","updated":"2024-08-24T08:50:25Z","published":"2024-08-24T08:50:25Z","title":"HRGraph: Leveraging LLMs for HR Data Knowledge Graphs with Information\n Propagation-based Job Recommendation","summary":" Knowledge Graphs (KGs) serving as semantic networks, prove highly effective\nin managing complex interconnected data in different domains, by offering a\nunified, contextualized, and structured representation with flexibility that\nallows for easy adaptation to evolving knowledge. Processing complex Human\nResources (HR) data, KGs can help in different HR functions like recruitment,\njob matching, identifying learning gaps, and enhancing employee retention.\nDespite their potential, limited efforts have been made to implement practical\nHR knowledge graphs. This study addresses this gap by presenting a framework\nfor effectively developing HR knowledge graphs from documents using Large\nLanguage Models. The resulting KG can be used for a variety of downstream\ntasks, including job matching, identifying employee skill gaps, and many more.\nIn this work, we showcase instances where HR KGs prove instrumental in precise\njob matching, yielding advantages for both employers and employees. Empirical\nevidence from experiments with information propagation in KGs and Graph Neural\nNets, along with case studies underscores the effectiveness of KGs in tasks\nsuch as job and employee recommendations and job area classification. Code and\ndata are available at : https://github.com/azminewasi/HRGraph\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2408.13521v1.pdf","comment":"7 Pages, 4 Figures. View in ACL Anthology:\n https://aclanthology.org/2024.kallm-1.6/"},{"id":"http://arxiv.org/abs/2408.13518v1","updated":"2024-08-24T08:44:04Z","published":"2024-08-24T08:44:04Z","title":"Selective Preference Optimization via Token-Level Reward Function\n Estimation","summary":" Recent advancements in large language model alignment leverage token-level\nsupervisions to perform fine-grained preference optimization. However, existing\ntoken-level alignment methods either optimize on all available tokens, which\ncan be noisy and inefficient, or perform selective training with complex and\nexpensive key token selection strategies. In this work, we propose Selective\nPreference Optimization (SePO), a novel selective alignment strategy that\ncenters on efficient key token selection. SePO proposes the first token\nselection method based on Direct Preference Optimization (DPO), which trains an\noracle model to estimate a token-level reward function on the target data. 
This\nmethod applies to any existing alignment datasets with response-level\nannotations and enables cost-efficient token selection with small-scale oracle\nmodels and training data. The estimated reward function is then utilized to\nscore all tokens within the target dataset, where only the key tokens are\nselected to supervise the target policy model with a reference model-free\ncontrastive objective function. Extensive experiments on three public\nevaluation benchmarks show that SePO significantly outperforms competitive\nbaseline methods by only optimizing 30% key tokens on the target dataset. SePO\napplications on weak-to-strong generalization show that weak oracle models\neffectively supervise strong policy models with up to 16.8x more parameters.\nSePO also effectively selects key tokens from out-of-distribution data to\nenhance strong policy models and alleviate the over-optimization problem.\n","authors":["Kailai Yang","Zhiwei Liu","Qianqian Xie","Jimin Huang","Erxue Min","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2408.13518v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.13501v1","updated":"2024-08-24T06:59:55Z","published":"2024-08-24T06:59:55Z","title":"Utilizing Large Language Models for Named Entity Recognition in\n Traditional Chinese Medicine against COVID-19 Literature: Comparative Study","summary":" Objective: To explore and compare the performance of ChatGPT and other\nstate-of-the-art LLMs on domain-specific NER tasks covering different entity\ntypes and domains in TCM against COVID-19 literature. Methods: We established a\ndataset of 389 articles on TCM against COVID-19, and manually annotated 48 of\nthem with 6 types of entities belonging to 3 domains as the ground truth,\nagainst which the NER performance of LLMs can be assessed. We then performed\nNER tasks for the 6 entity types using ChatGPT (GPT-3.5 and GPT-4) and 4\nstate-of-the-art BERT-based question-answering (QA) models (RoBERTa, MiniLM,\nPubMedBERT and SciBERT) without prior training on the specific task. A domain\nfine-tuned model (GSAP-NER) was also applied for a comprehensive comparison.\nResults: The overall performance of LLMs varied significantly in exact match\nand fuzzy match. In the fuzzy match, ChatGPT surpassed BERT-based QA models in\n5 out of 6 tasks, while in exact match, BERT-based QA models outperformed\nChatGPT in 5 out of 6 tasks but with a smaller F-1 difference. GPT-4 showed a\nsignificant advantage over other models in fuzzy match, especially on the\nentity type of TCM formula and the Chinese patent drug (TFD) and ingredient\n(IG). Although GPT-4 outperformed BERT-based models on entity type of herb,\ntarget, and research method, none of the F-1 scores exceeded 0.5. GSAP-NER,\noutperformed GPT-4 in terms of F-1 by a slight margin on RM. ChatGPT achieved\nconsiderably higher recalls than precisions, particularly in the fuzzy match.\nConclusions: The NER performance of LLMs is highly dependent on the entity\ntype, and their performance varies across application scenarios. ChatGPT could\nbe a good choice for scenarios where high recall is favored. However, for\nknowledge acquisition in rigorous scenarios, neither ChatGPT nor BERT-based QA\nmodels are off-the-shelf tools for professional practitioners.\n","authors":["Xu Tong","Nina Smirnova","Sharmila Upadhyaya","Ran Yu","Jack H. 
Culbert","Chao Sun","Wolfgang Otto","Philipp Mayr"],"pdf_url":"https://arxiv.org/pdf/2408.13501v1.pdf","comment":"22 pages with 2 figures"},{"id":"http://arxiv.org/abs/2408.13473v1","updated":"2024-08-24T05:15:15Z","published":"2024-08-24T05:15:15Z","title":"Why Antiwork: A RoBERTa-Based System for Work-Related Stress\n Identification and Leading Factor Analysis","summary":" Harsh working environments and work-related stress have been known to\ncontribute to mental health problems such as anxiety, depression, and suicidal\nideation. As such, it is paramount to create solutions that can both detect\nemployee unhappiness and find the root cause of the problem. While prior works\nhave examined causes of mental health using machine learning, they typically\nfocus on general mental health analysis, with few of them focusing on\nexplainable solutions or looking at the workplace-specific setting. r/antiwork\nis a subreddit for the antiwork movement, which is the desire to stop working\naltogether. Using this subreddit as a proxy for work environment\ndissatisfaction, we create a new dataset for antiwork sentiment detection and\nsubsequently train a model that highlights the words with antiwork sentiments.\nFollowing this, we performed a qualitative and quantitative analysis to uncover\nsome of the key insights into the mindset of individuals who identify with the\nantiwork movement and how their working environments influenced them. We find\nthat working environments that do not give employees authority or\nresponsibility, frustrating recruiting experiences, and unfair compensation,\nare some of the leading causes of the antiwork sentiment, resulting in a lack\nof self-confidence and motivation among their employees.\n","authors":["Tao Lu","Muzhe Wu","Xinyi Lu","Siyuan Xu","Shuyu Zhan","Anuj Tambwekar","Emily Mower Provost"],"pdf_url":"https://arxiv.org/pdf/2408.13473v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.13457v1","updated":"2024-08-24T04:03:35Z","published":"2024-08-24T04:03:35Z","title":"Make Every Penny Count: Difficulty-Adaptive Self-Consistency for\n Cost-Efficient Reasoning","summary":" Self-consistency (SC), a widely used decoding strategy for chain-of-thought\nreasoning, shows significant gains across various multi-step reasoning tasks\nbut comes with a high cost due to multiple sampling with the preset size. Its\nvariants, Adaptive self-consistency (ASC) and Early-stopping self-consistency\n(ESC), dynamically adjust the number of samples based on the posterior\ndistribution of a set of pre-samples, reducing the cost of SC with minimal\nimpact on performance. Both methods, however, do not exploit the prior\ninformation about question difficulty. It often results in unnecessary repeated\nsampling for easy questions that could be accurately answered with just one\nattempt, wasting resources. To tackle this problem, we propose\nDifficulty-Adaptive Self-Consistency (DSC), which leverages the difficulty\ninformation from both prior and posterior perspectives to adaptively allocate\ninference resources, further reducing the cost of SC. To demonstrate the\neffectiveness of DSC, we conduct extensive experiments on three popular\ncategories of reasoning tasks: arithmetic, commonsense and symbolic reasoning\non six benchmarks. 
The empirical results show that DSC consistently surpasses\nthe strong baseline ASC and ESC in terms of costs by a significant margin,\nwhile attaining comparable performances.\n","authors":["Xinglin Wang","Shaoxiong Feng","Yiwei Li","Peiwen Yuan","Yueqi Zhang","Boyuan Pan","Heda Wang","Yao Hu","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2408.13457v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.11319v2","updated":"2024-08-24T03:58:40Z","published":"2024-08-21T03:59:51Z","title":"SarcasmBench: Towards Evaluating Large Language Models on Sarcasm\n Understanding","summary":" In the era of large language models (LLMs), the task of ``System I''~-~the\nfast, unconscious, and intuitive tasks, e.g., sentiment analysis, text\nclassification, etc., have been argued to be successfully solved. However,\nsarcasm, as a subtle linguistic phenomenon, often employs rhetorical devices\nlike hyperbole and figuration to convey true sentiments and intentions,\ninvolving a higher level of abstraction than sentiment analysis. There is\ngrowing concern that the argument about LLMs' success may not be fully tenable\nwhen considering sarcasm understanding. To address this question, we select\neleven SOTA LLMs and eight SOTA pre-trained language models (PLMs) and present\ncomprehensive evaluations on six widely used benchmark datasets through\ndifferent prompting approaches, i.e., zero-shot input/output (IO) prompting,\nfew-shot IO prompting, chain of thought (CoT) prompting. Our results highlight\nthree key findings: (1) current LLMs underperform supervised PLMs based sarcasm\ndetection baselines across six sarcasm benchmarks. This suggests that\nsignificant efforts are still required to improve LLMs' understanding of human\nsarcasm. (2) GPT-4 consistently and significantly outperforms other LLMs across\nvarious prompting methods, with an average improvement of 14.0\\%$\\uparrow$.\nClaude 3 and ChatGPT demonstrate the next best performance after GPT-4. (3)\nFew-shot IO prompting method outperforms the other two methods: zero-shot IO\nand few-shot CoT. The reason is that sarcasm detection, being a holistic,\nintuitive, and non-rational cognitive process, is argued not to adhere to\nstep-by-step logical reasoning, making CoT less effective in understanding\nsarcasm compared to its effectiveness in mathematical reasoning tasks.\n","authors":["Yazhou Zhang","Chunwang Zou","Zheng Lian","Prayag Tiwari","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2408.11319v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15228v2","updated":"2024-08-24T03:51:44Z","published":"2024-04-23T16:59:02Z","title":"Re-Thinking Inverse Graphics With Large Language Models","summary":" Inverse graphics -- the task of inverting an image into physical variables\nthat, when rendered, enable reproduction of the observed scene -- is a\nfundamental challenge in computer vision and graphics. Successfully\ndisentangling an image into its constituent elements, such as the shape, color,\nand material properties of the objects of the 3D scene that produced it,\nrequires a comprehensive understanding of the environment. This complexity\nlimits the ability of existing carefully engineered approaches to generalize\nacross domains. Inspired by the zero-shot ability of large language models\n(LLMs) to generalize to novel contexts, we investigate the possibility of\nleveraging the broad world knowledge encoded in such models to solve\ninverse-graphics problems. 
To this end, we propose the Inverse-Graphics Large\nLanguage Model (IG-LLM), an inverse-graphics framework centered around an LLM,\nthat autoregressively decodes a visual embedding into a structured,\ncompositional 3D-scene representation. We incorporate a frozen pre-trained\nvisual encoder and a continuous numeric head to enable end-to-end training.\nThrough our investigation, we demonstrate the potential of LLMs to facilitate\ninverse graphics through next-token prediction, without the application of\nimage-space supervision. Our analysis enables new possibilities for precise\nspatial reasoning about images that exploit the visual knowledge of LLMs. We\nrelease our code and data at https://ig-llm.is.tue.mpg.de/ to ensure the\nreproducibility of our investigation and to facilitate future research.\n","authors":["Peter Kulits","Haiwen Feng","Weiyang Liu","Victoria Abrevaya","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2404.15228v2.pdf","comment":"TMLR camera-ready; 31 pages; project page:\n https://ig-llm.is.tue.mpg.de/"},{"id":"http://arxiv.org/abs/2408.10923v3","updated":"2024-08-24T03:22:09Z","published":"2024-08-20T15:05:02Z","title":"LBC: Language-Based-Classifier for Out-Of-Variable Generalization","summary":" Large Language Models (LLMs) have great success in natural language\nprocessing tasks such as response generation. However, their use in tabular\ndata has been limited due to their inferior performance compared to traditional\nmachine learning models (TMLs) such as XGBoost. We find that the pre-trained\nknowledge of LLMs enables them to interpret new variables that appear in a test\nwithout additional training, a capability central to the concept of\nOut-of-Variable (OOV). From the findings, we propose a\nLanguage-Based-Classifier (LBC), a classifier that maximizes the benefits of\nLLMs to outperform TMLs on OOV tasks. LBC employs three key methodological\nstrategies: 1) Categorical changes to adjust data to better fit the model's\nunderstanding, 2) Advanced order and indicator to enhance data representation\nto the model, and 3) Using verbalizer to map logit scores to classes during\ninference to generate model predictions. These strategies, combined with the\npre-trained knowledge of LBC, emphasize the model's ability to effectively\nhandle OOV tasks. We empirically and theoretically validate the superiority of\nLBC. LBC is the first study to apply an LLM-based model to OOV tasks. The\nsource code is at https://github.com/sksmssh/LBCforOOVGen\n","authors":["Kangjun Noh","Baekryun Seong","Hoyoon Byun","Youngjun Choi","Sungjin Song","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2408.10923v3.pdf","comment":"16 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.06266v2","updated":"2024-08-24T03:19:13Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. 
Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13442v1","updated":"2024-08-24T02:48:40Z","published":"2024-08-24T02:48:40Z","title":"A Law of Next-Token Prediction in Large Language Models","summary":" Large language models (LLMs) have been widely employed across various\napplication domains, yet their black-box nature poses significant challenges to\nunderstanding how these models process input data internally to make\npredictions. In this paper, we introduce a precise and quantitative law that\ngoverns the learning of contextualized token embeddings through intermediate\nlayers in pre-trained LLMs for next-token prediction. Our findings reveal that\neach layer contributes equally to enhancing prediction accuracy, from the\nlowest to the highest layer -- a universal phenomenon observed across a diverse\narray of open-source LLMs, built on architectures such as Transformer, RWKV,\nand Mamba. We demonstrate that this law offers new perspectives and insights to\ninform and guide practices in LLM development and applications, including model\nscaling, pre-training tasks, and information flow. Overall, our law enables\nmore fine-grained approaches to the design, training, and interpretation of\nLLMs through scrutinizing their internal data processing mechanisms.\n","authors":["Hangfeng He","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2408.13442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13440v1","updated":"2024-08-24T02:40:28Z","published":"2024-08-24T02:40:28Z","title":"Knowledge-Aware Conversation Derailment Forecasting Using Graph\n Convolutional Networks","summary":" Online conversations are particularly susceptible to derailment, which can\nmanifest itself in the form of toxic communication patterns including\ndisrespectful comments and abuse. Forecasting conversation derailment predicts\nsigns of derailment in advance enabling proactive moderation of conversations.\nState-of-the-art approaches to conversation derailment forecasting sequentially\nencode conversations and use graph neural networks to model dialogue user\ndynamics. However, existing graph models are not able to capture complex\nconversational characteristics such as context propagation and emotional\nshifts. The use of common sense knowledge enables a model to capture such\ncharacteristics, thus improving performance. Following this approach, here we\nderive commonsense statements from a knowledge base of dialogue contextual\ninformation to enrich a graph neural network classification architecture. 
We\nfuse the multi-source information on utterance into capsules, which are used by\na transformer-based forecaster to predict conversation derailment. Our model\ncaptures conversation dynamics and context propagation, outperforming the\nstate-of-the-art models on the CGA and CMV benchmark datasets\n","authors":["Enas Altarawneh","Ameeta Agrawal","Michael Jenkin","Manos Papagelis"],"pdf_url":"https://arxiv.org/pdf/2408.13440v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.12982;\n text overlap with arXiv:2106.01071 by other authors"},{"id":"http://arxiv.org/abs/2408.13432v1","updated":"2024-08-24T01:58:28Z","published":"2024-08-24T01:58:28Z","title":"Integrating Multi-Head Convolutional Encoders with Cross-Attention for\n Improved SPARQL Query Translation","summary":" The main task of the KGQA system (Knowledge Graph Question Answering) is to\nconvert user input questions into query syntax (such as SPARQL). With the rise\nof modern popular encoders and decoders like Transformer and ConvS2S, many\nscholars have shifted the research direction of SPARQL generation to the Neural\nMachine Translation (NMT) architecture or the generative AI field of\nText-to-SPARQL. In NMT-based QA systems, the system treats knowledge base query\nsyntax as a language. It uses NMT-based translation models to translate natural\nlanguage questions into query syntax. Scholars use popular architectures\nequipped with cross-attention, such as Transformer, ConvS2S, and BiLSTM, to\ntrain translation models for query syntax. To achieve better query results,\nthis paper improved the ConvS2S encoder and added multi-head attention from the\nTransformer, proposing a Multi-Head Conv encoder (MHC encoder) based on the\nn-gram language model. The principle is to use convolutional layers to capture\nlocal hidden features in the input sequence with different receptive fields,\nusing multi-head attention to calculate dependencies between them. Ultimately,\nwe found that the translation model based on the Multi-Head Conv encoder\nachieved better performance than other encoders, obtaining 76.52\\% and 83.37\\%\nBLEU-1 (BiLingual Evaluation Understudy) on the QALD-9 and LC-QuAD-1.0\ndatasets, respectively. Additionally, in the end-to-end system experiments on\nthe QALD-9 and LC-QuAD-1.0 datasets, we achieved leading results over other\nKGQA systems, with Macro F1-measures reaching 52\\% and 66\\%, respectively.\nMoreover, the experimental results show that with limited computational\nresources, if one possesses an excellent encoder-decoder architecture and\ncross-attention, experts and scholars can achieve outstanding performance\nequivalent to large pre-trained models using only general embeddings.\n","authors":["Yi-Hui Chen","Eric Jui-Lin Lu","Kwan-Ho Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.13432v1.pdf","comment":"24 pages, 20 figures, using the engrXiv template; the full version\n has been submitted to ACM Transactions on Information Systems and is\n currently under review. (2024)"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.13672v1","updated":"2024-08-24T21:22:15Z","published":"2024-08-24T21:22:15Z","title":"ColBERT's [MASK]-based Query Augmentation: Effects of Quadrupling the\n Query Input Length","summary":" A unique aspect of ColBERT is its use of [MASK] tokens in queries to score\ndocuments (query augmentation). 
Prior work shows [MASK] tokens weighting\nnon-[MASK] query terms, emphasizing certain tokens over others , rather than\nintroducing whole new terms as initially proposed. We begin by demonstrating\nthat a term weighting behavior previously reported for [MASK] tokens in\nColBERTv1 holds for ColBERTv2. We then examine the effect of changing the\nnumber of [MASK] tokens from zero to up to four times past the query input\nlength used in training, both for first stage retrieval, and for scoring\ncandidates, observing an initial decrease in performance with few [MASK]s, a\nlarge increase when enough [MASK]s are added to pad queries to an average\nlength of 32, then a plateau in performance afterwards. Additionally, we\ncompare baseline performance to performance when the query length is extended\nto 128 tokens, and find that differences are small (e.g., within 1% on various\nmetrics) and generally statistically insignificant, indicating performance does\nnot collapse if ColBERT is presented with more [MASK] tokens than expected.\n","authors":["Ben Giacalone","Richard Zanibbi"],"pdf_url":"https://arxiv.org/pdf/2408.13672v1.pdf","comment":"5 pages, 3 figures, two tables"},{"id":"http://arxiv.org/abs/2401.13463v3","updated":"2024-08-24T20:28:38Z","published":"2024-01-24T14:08:38Z","title":"SpeechDPR: End-to-End Spoken Passage Retrieval for Open-Domain Spoken\n Question Answering","summary":" Spoken Question Answering (SQA) is essential for machines to reply to user's\nquestion by finding the answer span within a given spoken passage. SQA has been\npreviously achieved without ASR to avoid recognition errors and\nOut-of-Vocabulary (OOV) problems. However, the real-world problem of\nOpen-domain SQA (openSQA), in which the machine needs to first retrieve\npassages that possibly contain the answer from a spoken archive in addition,\nwas never considered. This paper proposes the first known end-to-end framework,\nSpeech Dense Passage Retriever (SpeechDPR), for the retrieval component of the\nopenSQA problem. SpeechDPR learns a sentence-level semantic representation by\ndistilling knowledge from the cascading model of unsupervised ASR (UASR) and\ntext dense retriever (TDR). No manually transcribed speech data is needed.\nInitial experiments showed performance comparable to the cascading model of\nUASR and TDR, and significantly better when UASR was poor, verifying this\napproach is more robust to speech recognition errors.\n","authors":["Chyi-Jiunn Lin","Guan-Ting Lin","Yung-Sung Chuang","Wei-Lun Wu","Shang-Wen Li","Abdelrahman Mohamed","Hung-yi Lee","Lin-shan Lee"],"pdf_url":"https://arxiv.org/pdf/2401.13463v3.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2408.11623v2","updated":"2024-08-24T20:24:42Z","published":"2024-08-21T13:48:00Z","title":"End-to-End Cost-Effective Incentive Recommendation under Budget\n Constraint with Uplift Modeling","summary":" In modern online platforms, incentives are essential factors that enhance\nuser engagement and increase platform revenue. Over recent years, uplift\nmodeling has been introduced as a strategic approach to assign incentives to\nindividual customers. Especially in many real-world applications, online\nplatforms can only incentivize customers with specific budget constraints. This\nproblem can be reformulated as the multi-choice knapsack problem. This\noptimization aims to select the optimal incentive for each customer to maximize\nthe return on investment. 
Recent works in this field frequently tackle the\nbudget allocation problem using a two-stage approach. However, this solution is\nconfronted with the following challenges: (1) The causal inference methods\noften ignore the domain knowledge in online marketing, where the expected\nresponse curve of a customer should be monotonic and smooth as the incentive\nincreases. (2) An optimality gap between the two stages results in inferior\nsub-optimal allocation performance due to the loss of the incentive\nrecommendation information for the uplift prediction under the limited budget\nconstraint. To address these challenges, we propose a novel End-to-End\nCost-Effective Incentive Recommendation (E3IR) model under budget constraints.\nSpecifically, our methods consist of two modules, i.e., the uplift prediction\nmodule and the differentiable allocation module. In the uplift prediction\nmodule, we construct prediction heads to capture the incremental improvement\nbetween adjacent treatments with the marketing domain constraints (i.e.,\nmonotonic and smooth). We incorporate integer linear programming (ILP) as a\ndifferentiable layer input in the allocation module. Furthermore, we conduct\nextensive experiments on public and real product datasets, demonstrating that\nour E3IR improves allocation performance compared to existing two-stage\napproaches.\n","authors":["Zexu Sun","Hao Yang","Dugang Liu","Yunpeng Weng","Xing Tang","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.11623v2.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.13521v1","updated":"2024-08-24T08:50:25Z","published":"2024-08-24T08:50:25Z","title":"HRGraph: Leveraging LLMs for HR Data Knowledge Graphs with Information\n Propagation-based Job Recommendation","summary":" Knowledge Graphs (KGs) serving as semantic networks, prove highly effective\nin managing complex interconnected data in different domains, by offering a\nunified, contextualized, and structured representation with flexibility that\nallows for easy adaptation to evolving knowledge. Processing complex Human\nResources (HR) data, KGs can help in different HR functions like recruitment,\njob matching, identifying learning gaps, and enhancing employee retention.\nDespite their potential, limited efforts have been made to implement practical\nHR knowledge graphs. This study addresses this gap by presenting a framework\nfor effectively developing HR knowledge graphs from documents using Large\nLanguage Models. The resulting KG can be used for a variety of downstream\ntasks, including job matching, identifying employee skill gaps, and many more.\nIn this work, we showcase instances where HR KGs prove instrumental in precise\njob matching, yielding advantages for both employers and employees. Empirical\nevidence from experiments with information propagation in KGs and Graph Neural\nNets, along with case studies underscores the effectiveness of KGs in tasks\nsuch as job and employee recommendations and job area classification. Code and\ndata are available at : https://github.com/azminewasi/HRGraph\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2408.13521v1.pdf","comment":"7 Pages, 4 Figures. 
View in ACL Anthology:\n https://aclanthology.org/2024.kallm-1.6/"},{"id":"http://arxiv.org/abs/2408.13501v1","updated":"2024-08-24T06:59:55Z","published":"2024-08-24T06:59:55Z","title":"Utilizing Large Language Models for Named Entity Recognition in\n Traditional Chinese Medicine against COVID-19 Literature: Comparative Study","summary":" Objective: To explore and compare the performance of ChatGPT and other\nstate-of-the-art LLMs on domain-specific NER tasks covering different entity\ntypes and domains in TCM against COVID-19 literature. Methods: We established a\ndataset of 389 articles on TCM against COVID-19, and manually annotated 48 of\nthem with 6 types of entities belonging to 3 domains as the ground truth,\nagainst which the NER performance of LLMs can be assessed. We then performed\nNER tasks for the 6 entity types using ChatGPT (GPT-3.5 and GPT-4) and 4\nstate-of-the-art BERT-based question-answering (QA) models (RoBERTa, MiniLM,\nPubMedBERT and SciBERT) without prior training on the specific task. A domain\nfine-tuned model (GSAP-NER) was also applied for a comprehensive comparison.\nResults: The overall performance of LLMs varied significantly in exact match\nand fuzzy match. In the fuzzy match, ChatGPT surpassed BERT-based QA models in\n5 out of 6 tasks, while in exact match, BERT-based QA models outperformed\nChatGPT in 5 out of 6 tasks but with a smaller F-1 difference. GPT-4 showed a\nsignificant advantage over other models in fuzzy match, especially on the\nentity type of TCM formula and the Chinese patent drug (TFD) and ingredient\n(IG). Although GPT-4 outperformed BERT-based models on entity type of herb,\ntarget, and research method, none of the F-1 scores exceeded 0.5. GSAP-NER,\noutperformed GPT-4 in terms of F-1 by a slight margin on RM. ChatGPT achieved\nconsiderably higher recalls than precisions, particularly in the fuzzy match.\nConclusions: The NER performance of LLMs is highly dependent on the entity\ntype, and their performance varies across application scenarios. ChatGPT could\nbe a good choice for scenarios where high recall is favored. However, for\nknowledge acquisition in rigorous scenarios, neither ChatGPT nor BERT-based QA\nmodels are off-the-shelf tools for professional practitioners.\n","authors":["Xu Tong","Nina Smirnova","Sharmila Upadhyaya","Ran Yu","Jack H. Culbert","Chao Sun","Wolfgang Otto","Philipp Mayr"],"pdf_url":"https://arxiv.org/pdf/2408.13501v1.pdf","comment":"22 pages with 2 figures"},{"id":"http://arxiv.org/abs/2302.07335v2","updated":"2024-08-24T02:22:16Z","published":"2023-02-14T20:44:12Z","title":"Intelligent Model Update Strategy for Sequential Recommendation","summary":" Modern online platforms are increasingly employing recommendation systems to\naddress information overload and improve user engagement. There is an evolving\nparadigm in this research field that recommendation network learning occurs\nboth on the cloud and on edges with knowledge transfer in between (i.e.,\nedge-cloud collaboration). Recent works push this field further by enabling\nedge-specific context-aware adaptivity, where model parameters are updated in\nreal-time based on incoming on-edge data. However, we argue that frequent data\nexchanges between the cloud and edges often lead to inefficiency and waste of\ncommunication/computation resources, as considerable parameter updates might be\nredundant. 
To investigate this problem, we introduce Intelligent Edge-Cloud\nParameter Request Model, abbreviated as IntellectReq.\n IntellectReq is designed to operate on edge, evaluating the cost-benefit\nlandscape of parameter requests with minimal computation and communication\noverhead. We formulate this as a novel learning task, aimed at the detection of\nout-of-distribution data, thereby fine-tuning adaptive communication\nstrategies. Further, we employ statistical mapping techniques to convert\nreal-time user behavior into a normal distribution, thereby employing\nmulti-sample outputs to quantify the model's uncertainty and thus its\ngeneralization capabilities. Rigorous empirical validation on four\nwidely-adopted benchmarks evaluates our approach, evidencing a marked\nimprovement in the efficiency and generalizability of edge-cloud collaborative\nand dynamic recommendation systems.\n","authors":["Zheqi Lv","Wenqiao Zhang","Zhengyu Chen","Shengyu Zhang","Kun Kuang"],"pdf_url":"https://arxiv.org/pdf/2302.07335v2.pdf","comment":"Published on WWW'24(Oral): Proceedings of the ACM on Web Conference\n 2024 (pp. 3117-3128)"},{"id":"http://arxiv.org/abs/2408.13484v1","updated":"2024-08-24T06:07:25Z","published":"2024-08-24T06:07:25Z","title":"IntOPE: Off-Policy Evaluation in the Presence of Interference","summary":" Off-Policy Evaluation (OPE) is employed to assess the potential impact of a\nhypothetical policy using logged contextual bandit feedback, which is crucial\nin areas such as personalized medicine and recommender systems, where online\ninteractions are associated with significant risks and costs. Traditionally,\nOPE methods rely on the Stable Unit Treatment Value Assumption (SUTVA), which\nassumes that the reward for any given individual is unaffected by the actions\nof others. However, this assumption often fails in real-world scenarios due to\nthe presence of interference, where an individual's reward is affected not just\nby their own actions but also by the actions of their peers. This realization\nreveals significant limitations of existing OPE methods in real-world\napplications. To address this limitation, we propose IntIPW, an IPW-style\nestimator that extends the Inverse Probability Weighting (IPW) framework by\nintegrating marginalized importance weights to account for both individual\nactions and the influence of adjacent entities. Extensive experiments are\nconducted on both synthetic and real-world data to demonstrate the\neffectiveness of the proposed IntIPW method.\n","authors":["Yuqi Bai","Ziyu Zhao","Minqin Zhu","Kun Kuang"],"pdf_url":"https://arxiv.org/pdf/2408.13484v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.13608v1","updated":"2024-08-24T15:36:08Z","published":"2024-08-24T15:36:08Z","title":"SpeechCraft: A Fine-grained Expressive Speech Dataset with Natural\n Language Description","summary":" Speech-language multi-modal learning presents a significant challenge due to\nthe fine nuanced information inherent in speech styles. Therefore, a\nlarge-scale dataset providing elaborate comprehension of speech style is\nurgently needed to facilitate insightful interplay between speech audio and\nnatural language. However, constructing such datasets presents a major\ntrade-off between large-scale data collection and high-quality annotation. To\ntackle this challenge, we propose an automatic speech annotation system for\nexpressiveness interpretation that annotates in-the-wild speech clips with\nexpressive and vivid human language descriptions. 
Initially, speech audios are\nprocessed by a series of expert classifiers and captioning models to capture\ndiverse speech characteristics, followed by a fine-tuned LLaMA for customized\nannotation generation. Unlike previous tag/templet-based annotation frameworks\nwith limited information and diversity, our system provides in-depth\nunderstandings of speech style through tailored natural language descriptions,\nthereby enabling accurate and voluminous data generation for large model\ntraining. With this system, we create SpeechCraft, a fine-grained bilingual\nexpressive speech dataset. It is distinguished by highly descriptive natural\nlanguage style prompts, containing approximately 2,000 hours of audio data and\nencompassing over two million speech clips. Extensive experiments demonstrate\nthat the proposed dataset significantly boosts speech-language task performance\nin stylist speech synthesis and speech style understanding.\n","authors":["Zeyu Jin","Jia Jia","Qixin Wang","Kehan Li","Shuoyi Zhou","Songtao Zhou","Xiaoyu Qin","Zhiyong Wu"],"pdf_url":"https://arxiv.org/pdf/2408.13608v1.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.13520v1","updated":"2024-08-24T08:47:09Z","published":"2024-08-24T08:47:09Z","title":"An Open, Cross-Platform, Web-Based Metaverse Using WebXR and A-Frame","summary":" The metaverse has received much attention in the literature and industry in\nthe last few years, but the lack of an open and cross-platform architecture has\nled to many distinct metaverses that cannot communicate with each other. This\nwork proposes a WebXR-based cross-platform architecture for developing spatial\nweb apps using the A-Frame and Networked-Aframe frameworks with a view to an\nopen and interoperable metaverse, accessible from both the web and extended\nreality devices. A prototype was implemented and evaluated, supporting the\ncapability of the technology stack to enable immersive experiences across\ndifferent platforms and devices. Positive feedback on ease of use of the\nimmersive environment further corroborates the proposed approach, underscoring\nits effectiveness in facilitating engaging and interactive virtual spaces. By\nadhering to principles of interoperability and inclusivity, it lives up to Tim\nBerners-Lee's vision of the World Wide Web as an open platform that transcends\ngeographical and technical boundaries.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2408.13520v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.05317"},{"id":"http://arxiv.org/abs/2403.02905v3","updated":"2024-08-24T00:29:50Z","published":"2024-03-05T12:13:18Z","title":"MMoFusion: Multi-modal Co-Speech Motion Generation with Diffusion Model","summary":" The body movements accompanying speech aid speakers in expressing their\nideas. Co-speech motion generation is one of the important approaches for\nsynthesizing realistic avatars. Due to the intricate correspondence between\nspeech and motion, generating realistic and diverse motion is a challenging\ntask. In this paper, we propose MMoFusion, a Multi-modal co-speech Motion\ngeneration framework based on the diffusion model to ensure both the\nauthenticity and diversity of generated motion. We propose a progressive fusion\nstrategy to enhance the interaction of inter-modal and intra-modal, efficiently\nintegrating multi-modal information. 
Specifically, we employ a masked style\nmatrix based on emotion and identity information to control the generation of\ndifferent motion styles. Temporal modeling of speech and motion is partitioned\ninto style-guided specific feature encoding and shared feature encoding, aiming\nto learn both inter-modal and intra-modal features. Besides, we propose a\ngeometric loss to enforce the joints' velocity and acceleration coherence among\nframes. Our framework generates vivid, diverse, and style-controllable motion\nof arbitrary length through inputting speech and editing identity and emotion.\nExtensive experiments demonstrate that our method outperforms current co-speech\nmotion generation methods including upper body and challenging full body.\n","authors":["Sen Wang","Jiangning Zhang","Xin Tan","Zhifeng Xie","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2403.02905v3.pdf","comment":null}]},"2024-08-27T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.16553v7","updated":"2024-08-27T17:57:07Z","published":"2024-01-29T20:44:10Z","title":"SelectLLM: Can LLMs Select Important Instructions to Annotate?","summary":" Instruction tuning benefits from large and diverse datasets; however,\ncreating such datasets involves a high cost of human labeling. While synthetic\ndatasets generated by large language models (LLMs) have partly solved this\nissue, they often contain low-quality data. One effective solution is\nselectively annotating unlabelled instructions, especially given the relative\nease of acquiring unlabeled instructions or texts from various sources.\nHowever, how to select unlabelled instructions is not well-explored, especially\nin the context of LLMs. Therefore, we introduce SelectLLM, an alternative\nframework that leverages the capabilities of LLMs to select unlabeled\ninstructions more effectively. Specifically, SelectLLM consists of two key\nsteps: Coreset-based clustering of unlabelled instructions for enlarging\ndiversity and prompting of LLM to identify the most beneficial instructions\nwithin each cluster. We evaluate SelectLLM on AlpacaEval2 and MT-Bench,\ndemonstrating its ability to outperform state-of-the-art methods like\nAlpagasus. In addition, we compare the performance and compatibility of\nSelectLLM with various LLMs, such as ChatGPT, LLaMA-3.1-70B, and Gemma-2-27b.\nSelectLLM's adaptability and robustness are further evidenced by its ability to\nmaintain high performance across both human and synthetic datasets. All code\nand data are publicly available (https://github.com/minnesotanlp/select-llm).\n","authors":["Ritik Sachin Parkar","Jaehyung Kim","Jong Inn Park","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2401.16553v7.pdf","comment":"First Authors: Ritik Sachin Parkar and Jaehyung Kim | Second Author:\n Jong Inn Park | PI: Dongyeop Kang"},{"id":"http://arxiv.org/abs/2408.15232v1","updated":"2024-08-27T17:50:03Z","published":"2024-08-27T17:50:03Z","title":"Into the Unknown Unknowns: Engaged Human Learning through Participation\n in Language Model Agent Conversations","summary":" While language model (LM)-powered chatbots and generative search engines\nexcel at answering concrete queries, discovering information in the terrain of\nunknown unknowns remains challenging for users. To emulate the common\neducational scenario where children/students learn by listening to and\nparticipating in conversations of their parents/teachers, we create\nCollaborative STORM (Co-STORM). 
Unlike QA systems that require users to ask all\nthe questions, Co-STORM lets users observe and occasionally steer the discourse\namong several LM agents. The agents ask questions on the user's behalf,\nallowing the user to discover unknown unknowns serendipitously. To facilitate\nuser interaction, Co-STORM assists users in tracking the discourse by\norganizing the uncovered information into a dynamic mind map, ultimately\ngenerating a comprehensive report as takeaways. For automatic evaluation, we\nconstruct the WildSeek dataset by collecting real information-seeking records\nwith user goals. Co-STORM outperforms baseline methods on both discourse trace\nand report quality. In a further human evaluation, 70% of participants prefer\nCo-STORM over a search engine, and 78% favor it over a RAG chatbot.\n","authors":["Yucheng Jiang","Yijia Shao","Dekun Ma","Sina J. Semnani","Monica S. Lam"],"pdf_url":"https://arxiv.org/pdf/2408.15232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03334v2","updated":"2024-08-27T17:46:31Z","published":"2024-03-05T21:36:23Z","title":"DIVERSE: A Dataset of YouTube Video Comment Stances with a Data\n Programming Model","summary":" Stance detection of social media text is a key component of many real-world\napplications like evaluating marketing campaigns, evaluating political policies\nor candidates, or evaluating information environments. However, creating\nautomatic stance labeling systems requires the manual annotation of stances,\nwhich is both tedious and resource-intensive. This paper introduces a stance\nlabeling method that makes use of weak signals of sentence tone, then\nconsolidating these signals with a Data Programmingmodel for the final stance\nlabel. In a time of international conflict, understanding the public opinion\ntowards the country's military is crucial for recruitment. We present DIVERSE,\na dataset involve stances towards YouTube videos of the US military (Dataset\navailable at https://doi.org/10.5281/zenodo.10493803). On average, the videos\nhave 200 comments each, and the stances skew slightly towards the \"against\"\ncharacterization for both the US army and the video.\n","authors":["Iain J. Cruickshank","Amir Soofi","Lynnette Hui Xian Ng"],"pdf_url":"https://arxiv.org/pdf/2403.03334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15221v1","updated":"2024-08-27T17:33:30Z","published":"2024-08-27T17:33:30Z","title":"LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet","summary":" Recent large language model (LLM) defenses have greatly improved models'\nability to refuse harmful queries, even when adversarially attacked. However,\nLLM defenses are primarily evaluated against automated adversarial attacks in a\nsingle turn of conversation, an insufficient threat model for real-world\nmalicious use. We demonstrate that multi-turn human jailbreaks uncover\nsignificant vulnerabilities, exceeding 70% attack success rate (ASR) on\nHarmBench against defenses that report single-digit ASRs with automated\nsingle-turn attacks. Human jailbreaks also reveal vulnerabilities in machine\nunlearning defenses, successfully recovering dual-use biosecurity knowledge\nfrom unlearned models. 
We compile these results into Multi-Turn Human\nJailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks.\nWe publicly release MHJ alongside a compendium of jailbreak tactics developed\nacross dozens of commercial red teaming engagements, supporting research\ntowards stronger LLM defenses.\n","authors":["Nathaniel Li","Ziwen Han","Ian Steneker","Willow Primack","Riley Goodside","Hugh Zhang","Zifan Wang","Cristina Menghini","Summer Yue"],"pdf_url":"https://arxiv.org/pdf/2408.15221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15213v1","updated":"2024-08-27T17:19:57Z","published":"2024-08-27T17:19:57Z","title":"Classifying populist language in American presidential and governor\n speeches using automatic text analysis","summary":" Populism is a concept that is often used but notoriously difficult to\nmeasure. Common qualitative measurements like holistic grading or content\nanalysis require great amounts of time and labour, making it difficult to\nquickly scope out which politicians should be classified as populist and which\nshould not, while quantitative methods show mixed results when it comes to\nclassifying populist rhetoric. In this paper, we develop a pipeline to train\nand validate an automated classification model to estimate the use of populist\nlanguage. We train models based on sentences that were identified as populist\nand pluralist in 300 US governors' speeches from 2010 to 2018 and in 45\nspeeches of presidential candidates in 2016. We find that these models classify\nmost speeches correctly, including 84% of governor speeches and 89% of\npresidential speeches. These results extend to different time periods (with 92%\naccuracy on more recent American governors), different amounts of data (with as\nfew as 70 training sentences per category achieving similar results), and when\nclassifying politicians instead of individual speeches. This pipeline is thus\nan effective tool that can optimise the systematic and swift classification of\nthe use of populist language in politicians' speeches.\n","authors":["Olaf van der Veen","Semir Dzebo","Levi Littvay","Kirk Hawkins","Oren Dar"],"pdf_url":"https://arxiv.org/pdf/2408.15213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15204v1","updated":"2024-08-27T17:03:18Z","published":"2024-08-27T17:03:18Z","title":"Can Unconfident LLM Annotations Be Used for Confident Conclusions?","summary":" Large language models (LLMs) have shown high agreement with human raters\nacross a variety of tasks, demonstrating potential to ease the challenges of\nhuman data collection. In computational social science (CSS), researchers are\nincreasingly leveraging LLM annotations to complement slow and expensive human\nannotations. Still, guidelines for collecting and using LLM annotations,\nwithout compromising the validity of downstream conclusions, remain limited. We\nintroduce Confidence-Driven Inference: a method that combines LLM annotations\nand LLM confidence indicators to strategically select which human annotations\nshould be collected, with the goal of producing accurate statistical estimates\nand provably valid confidence intervals while reducing the number of human\nannotations needed. Our approach comes with safeguards against LLM annotations\nof poor quality, guaranteeing that the conclusions will be both valid and no\nless accurate than if we only relied on human annotations. 
We demonstrate the\neffectiveness of Confidence-Driven Inference over baselines in statistical\nestimation tasks across three CSS settings--text politeness, stance, and\nbias--reducing the needed number of human annotations by over 25% in each.\nAlthough we use CSS settings for demonstration, Confidence-Driven Inference can\nbe used to estimate most standard quantities across a broad range of NLP\nproblems.\n","authors":["Kristina Gligorić","Tijana Zrnic","Cinoo Lee","Emmanuel J. Candès","Dan Jurafsky"],"pdf_url":"https://arxiv.org/pdf/2408.15204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15188v1","updated":"2024-08-27T16:44:41Z","published":"2024-08-27T16:44:41Z","title":"Infusing Acoustic Pause Context into Text-Based Dementia Assessment","summary":" Speech pauses, alongside content and structure, offer a valuable and\nnon-invasive biomarker for detecting dementia. This work investigates the use\nof pause-enriched transcripts in transformer-based language models to\ndifferentiate the cognitive states of subjects with no cognitive impairment,\nmild cognitive impairment, and Alzheimer's dementia based on their speech from\na clinical assessment. We address three binary classification tasks: Onset,\nmonitoring, and dementia exclusion. The performance is evaluated through\nexperiments on a German Verbal Fluency Test and a Picture Description Test,\ncomparing the model's effectiveness across different speech production\ncontexts. Starting from a textual baseline, we investigate the effect of\nincorporation of pause information and acoustic context. We show the test\nshould be chosen depending on the task, and similarly, lexical pause\ninformation and acoustic cross-attention contribute differently.\n","authors":["Franziska Braun","Sebastian P. Bayerl","Florian Hönig","Hartmut Lehfeld","Thomas Hillemacher","Tobias Bocklet","Korbinian Riedhammer"],"pdf_url":"https://arxiv.org/pdf/2408.15188v1.pdf","comment":"Accepted at INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2408.15176v1","updated":"2024-08-27T16:18:51Z","published":"2024-08-27T16:18:51Z","title":"Unlocking Potential in Pre-Trained Music Language Models for Versatile\n Multi-Track Music Arrangement","summary":" Large language models have shown significant capabilities across various\ndomains, including symbolic music generation. However, leveraging these\npre-trained models for controllable music arrangement tasks, each requiring\ndifferent forms of musical information as control, remains a novel challenge.\nIn this paper, we propose a unified sequence-to-sequence framework that enables\nthe fine-tuning of a symbolic music language model for multiple multi-track\narrangement tasks, including band arrangement, piano reduction, drum\narrangement, and voice separation. Our experiments demonstrate that the\nproposed approach consistently achieves higher musical quality compared to\ntask-specific baselines across all four tasks. 
Furthermore, through additional\nexperiments on probing analysis, we show the pre-training phase equips the\nmodel with essential knowledge to understand musical conditions, which is hard\nto acquired solely through task-specific fine-tuning.\n","authors":["Longshen Ou","Jingwei Zhao","Ziyu Wang","Gus Xia","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15176v1.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2408.15172v1","updated":"2024-08-27T16:10:21Z","published":"2024-08-27T16:10:21Z","title":"X-Reflect: Cross-Reflection Prompting for Multimodal Recommendation","summary":" Large Language Models (LLMs) and Large Multimodal Models (LMMs) have been\nshown to enhance the effectiveness of enriching item descriptions, thereby\nimproving the accuracy of recommendation systems. However, most existing\napproaches either rely on text-only prompting or employ basic multimodal\nstrategies that do not fully exploit the complementary information available\nfrom both textual and visual modalities. This paper introduces a novel\nframework, Cross-Reflection Prompting, termed X-Reflect, designed to address\nthese limitations by prompting LMMs to explicitly identify and reconcile\nsupportive and conflicting information between text and images. By capturing\nnuanced insights from both modalities, this approach generates more\ncomprehensive and contextually richer item representations. Extensive\nexperiments conducted on two widely used benchmarks demonstrate that our method\noutperforms existing prompting baselines in downstream recommendation accuracy.\nAdditionally, we evaluate the generalizability of our framework across\ndifferent LMM backbones and the robustness of the prompting strategies,\noffering insights for optimization. This work underscores the importance of\nintegrating multimodal information and presents a novel solution for improving\nitem understanding in multimodal recommendation systems.\n","authors":["Hanjia Lyu","Ryan Rossi","Xiang Chen","Md Mehrab Tanjim","Stefano Petrangeli","Somdeb Sarkhel","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15171v1","updated":"2024-08-27T16:09:56Z","published":"2024-08-27T16:09:56Z","title":"Measuring text summarization factuality using atomic facts entailment\n metrics in the context of retrieval augmented generation","summary":" The use of large language models (LLMs) has significantly increased since the\nintroduction of ChatGPT in 2022, demonstrating their value across various\napplications. However, a major challenge for enterprise and commercial adoption\nof LLMs is their tendency to generate inaccurate information, a phenomenon\nknown as \"hallucination.\" This project proposes a method for estimating the\nfactuality of a summary generated by LLMs when compared to a source text. Our\napproach utilizes Naive Bayes classification to assess the accuracy of the\ncontent produced.\n","authors":["N. E. Kriman"],"pdf_url":"https://arxiv.org/pdf/2408.15171v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.15138v1","updated":"2024-08-27T15:23:09Z","published":"2024-08-27T15:23:09Z","title":"How transformers learn structured data: insights from hierarchical\n filtering","summary":" We introduce a hierarchical filtering procedure for generative models of\nsequences on trees, enabling control over the range of positional correlations\nin the data. 
Leveraging this controlled setting, we provide evidence that\nvanilla encoder-only transformer architectures can implement the optimal Belief\nPropagation algorithm on both root classification and masked language modeling\ntasks. Correlations at larger distances corresponding to increasing layers of\nthe hierarchy are sequentially included as the network is trained. We analyze\nhow the transformer layers succeed by focusing on attention maps from models\ntrained with varying degrees of filtering. These attention maps show clear\nevidence for iterative hierarchical reconstruction of correlations, and we can\nrelate these observations to a plausible implementation of the exact inference\nalgorithm for the network sizes considered.\n","authors":["Jerome Garnier-Brun","Marc Mézard","Emanuele Moscato","Luca Saglietti"],"pdf_url":"https://arxiv.org/pdf/2408.15138v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.07531v2","updated":"2024-08-27T15:16:06Z","published":"2024-08-14T13:03:41Z","title":"Development of a Large Language Model-based Multi-Agent Clinical\n Decision Support System for Korean Triage and Acuity Scale (KTAS)-Based\n Triage and Treatment Planning in Emergency Departments","summary":" Emergency department (ED) overcrowding and the complexity of rapid\ndecision-making in critical care settings pose significant challenges to\nhealthcare systems worldwide. While clinical decision support systems (CDSS)\nhave shown promise, the integration of large language models (LLMs) offers new\npossibilities for enhancing triage accuracy and clinical decision-making. This\nstudy presents an LLM-driven CDSS designed to assist ED physicians and nurses\nin patient triage, treatment planning, and overall emergency care management.\n We developed a multi-agent CDSS utilizing Llama-3-70b as the base LLM,\norchestrated by CrewAI and Langchain. The system comprises four AI agents\nemulating key ED roles: Triage Nurse, Emergency Physician, Pharmacist, and ED\nCoordinator. It incorporates the Korean Triage and Acuity Scale (KTAS) for\ntriage assessment and integrates with the RxNorm API for medication management.\n The model was evaluated using the Asclepius dataset, with performance\nassessed by a clinical emergency medicine specialist. The CDSS demonstrated\nhigh accuracy in triage decision-making compared to the baseline of a\nsingle-agent system. Furthermore, the system exhibited strong performance in\ncritical areas, including primary diagnosis, critical findings identification,\ndisposition decision-making, treatment planning, and resource allocation.\n Our multi-agent CDSS demonstrates significant potential for supporting\ncomprehensive emergency care management. By leveraging state-of-the-art AI\ntechnologies, this system offers a scalable and adaptable tool that could\nenhance emergency medical care delivery, potentially alleviating ED\novercrowding and improving patient outcomes. 
This work contributes to the\ngrowing field of AI applications in emergency medicine and offers a promising\ndirection for future research and clinical implementation.\n","authors":["Seungjun Han","Wongyung Choi"],"pdf_url":"https://arxiv.org/pdf/2408.07531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15091v1","updated":"2024-08-27T14:22:02Z","published":"2024-08-27T14:22:02Z","title":"Relation Also Knows: Rethinking the Recall and Editing of Factual\n Associations in Auto-Regressive Transformer Language Models","summary":" The storage and recall of factual associations in auto-regressive transformer\nlanguage models (LMs) have drawn a great deal of attention, inspiring knowledge\nediting by directly modifying the located model weights. Most editing works\nachieve knowledge editing under the guidance of existing interpretations of\nknowledge recall that mainly focus on subject knowledge. However, these\ninterpretations are seriously flawed, neglecting relation information and\nleading to the over-generalizing problem for editing. In this work, we discover\na novel relation-focused perspective to interpret the knowledge recall of\ntransformer LMs during inference and apply it on knowledge editing to avoid\nover-generalizing. Experimental results on the dataset supplemented with a new\nR-Specificity criterion demonstrate that our editing approach significantly\nalleviates over-generalizing while remaining competitive on other criteria,\nbreaking the domination of subject-focused editing for future research.\n","authors":["Xiyu Liu","Zhengxiao Liu","Naibin Gu","Zheng Lin","Wanli Ma","Ji Xiang","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05195v2","updated":"2024-08-27T14:20:57Z","published":"2023-11-09T08:19:34Z","title":"PRODIGy: a PROfile-based DIalogue Generation dataset","summary":" Providing dialogue agents with a profile representation can improve their\nconsistency and coherence, leading to better conversations. However, current\nprofile-based dialogue datasets for training such agents contain either\nexplicit profile representations that are simple and dialogue-specific, or\nimplicit representations that are difficult to collect. In this work, we\npropose a unified framework in which we bring together both standard and more\nsophisticated profile representations by creating a new resource where each\ndialogue is aligned with all possible speaker representations such as\ncommunication style, biographies, and personality. This framework allows to\ntest several baselines built using generative language models with several\nprofile configurations. The automatic evaluation shows that profile-based\nmodels have better generalisation capabilities than models trained on dialogues\nonly, both in-domain and cross-domain settings. These results are consistent\nfor fine-tuned models and instruction-based LLMs. Additionally, human\nevaluation demonstrates a clear preference for generations consistent with both\nprofile and context. Finally, to account for possible privacy concerns, all\nexperiments are done under two configurations: inter-character and\nintra-character. 
In the former, the LM stores the information about the\ncharacter in its internal representation, while in the latter, the LM does not\nretain any personal information but uses it only at inference time.\n","authors":["Daniela Occhipinti","Serra Sinem Tekiroglu","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2311.05195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14340v2","updated":"2024-08-27T14:09:44Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elio Quinton","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wenhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15079v1","updated":"2024-08-27T14:08:23Z","published":"2024-08-27T14:08:23Z","title":"BaichuanSEED: Sharing the Potential of ExtensivE Data Collection and\n Deduplication by Introducing a Competitive Large Language Model Baseline","summary":" The general capabilities of Large Language Models (LLM) highly rely on the\ncomposition and selection on extensive pretraining datasets, treated as\ncommercial secrets by several institutions. 
To mitigate this issue, we\nopen-source the details of a universally applicable data processing pipeline\nand validate its effectiveness and potential by introducing a competitive LLM\nbaseline. Specifically, the data processing pipeline consists of broad\ncollection to scale up and reweighting to improve quality. We then pretrain a\n7B model BaichuanSEED with 3T tokens processed by our pipeline without any\ndeliberate downstream task-related optimization, followed by an easy but\neffective supervised fine-tuning stage. BaichuanSEED demonstrates consistency\nand predictability throughout training and achieves comparable performance on\ncomprehensive benchmarks with several commercial advanced large language\nmodels, such as Qwen1.5 and Llama3. We also conduct several heuristic\nexperiments to discuss the potential for further optimization of downstream\ntasks, such as mathematics and coding.\n","authors":["Guosheng Dong","Da Pan","Yiding Sun","Shusen Zhang","Zheng Liang","Xin Wu","Yanjun Shen","Fan Yang","Haoze Sun","Tianpeng Li","Mingan Lin","Jianhua Xu","Yufan Zhang","Xiaonan Nie","Lei Su","Bingning Wang","Wentao Zhang","Jiaxin Mao","Zenan Zhou","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15079v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.15050v1","updated":"2024-08-27T13:19:32Z","published":"2024-08-27T13:19:32Z","title":"Self-supervised Topic Taxonomy Discovery in the Box Embedding Space","summary":" Topic taxonomy discovery aims at uncovering topics of different abstraction\nlevels and constructing hierarchical relations between them. Unfortunately,\nmost of prior work can hardly model semantic scopes of words and topics by\nholding the Euclidean embedding space assumption. What's worse, they infer\nasymmetric hierarchical relations by symmetric distances between topic\nembeddings. As a result, existing methods suffer from problems of low-quality\ntopics at high abstraction levels and inaccurate hierarchical relations. To\nalleviate these problems, this paper develops a Box embedding-based Topic Model\n(BoxTM) that maps words and topics into the box embedding space, where the\nasymmetric metric is defined to properly infer hierarchical relations among\ntopics. Additionally, our BoxTM explicitly infers upper-level topics based on\ncorrelation between specific topics through recursive clustering on topic\nboxes. Finally, extensive experiments validate high-quality of the topic\ntaxonomy learned by BoxTM.\n","authors":["Yuyin Lu","Hegang Chen","Pengbo Mao","Yanghui Rao","Haoran Xie","Fu Lee Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2408.15050v1.pdf","comment":"to be published in TACL"},{"id":"http://arxiv.org/abs/2408.15040v1","updated":"2024-08-27T13:10:05Z","published":"2024-08-27T13:10:05Z","title":"A Survey of Large Language Models for European Languages","summary":" Large Language Models (LLMs) have gained significant attention due to their\nhigh performance on a wide range of natural language tasks since the release of\nChatGPT. The LLMs learn to understand and generate language by training\nbillions of model parameters on vast volumes of text data. Despite being a\nrelatively new field, LLM research is rapidly advancing in various directions.\nIn this paper, we present an overview of LLM families, including LLaMA, PaLM,\nGPT, and MoE, and the methods developed to create and enhance LLMs for official\nEuropean Union (EU) languages. 
We provide a comprehensive summary of common\nmonolingual and multilingual datasets used for pretraining LLMs.\n","authors":["Wazir Ali","Sampo Pyysalo"],"pdf_url":"https://arxiv.org/pdf/2408.15040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15037v1","updated":"2024-08-27T13:07:07Z","published":"2024-08-27T13:07:07Z","title":"Evidence-Enhanced Triplet Generation Framework for Hallucination\n Alleviation in Generative Question Answering","summary":" To address the hallucination in generative question answering (GQA) where the\nanswer can not be derived from the document, we propose a novel\nevidence-enhanced triplet generation framework, EATQA, encouraging the model to\npredict all the combinations of (Question, Evidence, Answer) triplet by\nflipping the source pair and the target label to understand their logical\nrelationships, i.e., predict Answer(A), Question(Q), and Evidence(E) given a\nQE, EA, and QA pairs, respectively. Furthermore, we bridge the distribution gap\nto distill the knowledge from evidence in inference stage. Our framework\nensures the model to learn the logical relation between query, evidence and\nanswer, which simultaneously improves the evidence generation and query\nanswering. In this paper, we apply EATQA to LLama and it outperforms other\nLLMs-based methods and hallucination mitigation approaches on two challenging\nGQA benchmarks. Further analysis shows that our method not only keeps prior\nknowledge within LLM, but also mitigates hallucination and generates faithful\nanswers.\n","authors":["Haowei Du","Huishuai Zhang","Dongyan Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.15037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14991v1","updated":"2024-08-27T12:15:43Z","published":"2024-08-27T12:15:43Z","title":"Speech Recognition Transformers: Topological-lingualism Perspective","summary":" Transformers have evolved with great success in various artificial\nintelligence tasks. Thanks to our recent prevalence of self-attention\nmechanisms, which capture long-term dependency, phenomenal outcomes in speech\nprocessing and recognition tasks have been produced. The paper presents a\ncomprehensive survey of transformer techniques oriented in speech modality. The\nmain contents of this survey include (1) background of traditional ASR,\nend-to-end transformer ecosystem, and speech transformers (2) foundational\nmodels in a speech via lingualism paradigm, i.e., monolingual, bilingual,\nmultilingual, and cross-lingual (3) dataset and languages, acoustic features,\narchitecture, decoding, and evaluation metric from a specific topological\nlingualism perspective (4) popular speech transformer toolkit for building\nend-to-end ASR systems. Finally, highlight the discussion of open challenges\nand potential research directions for the community to conduct further research\nin this domain.\n","authors":["Shruti Singh","Muskaan Singh","Virender Kadyan"],"pdf_url":"https://arxiv.org/pdf/2408.14991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14972v1","updated":"2024-08-27T11:24:38Z","published":"2024-08-27T11:24:38Z","title":"AgentMonitor: A Plug-and-Play Framework for Predictive and Secure\n Multi-Agent Systems","summary":" The rapid advancement of large language models (LLMs) has led to the rise of\nLLM-based agents. Recent research shows that multi-agent systems (MAS), where\neach agent plays a specific role, can outperform individual LLMs. However,\nconfiguring an MAS for a task remains challenging, with performance only\nobservable post-execution. 
Inspired by scaling laws in LLM development, we\ninvestigate whether MAS performance can be predicted beforehand. We introduce\nAgentMonitor, a framework that integrates at the agent level to capture inputs\nand outputs, transforming them into statistics for training a regression model\nto predict task performance. Additionally, it can further apply real-time\ncorrections to address security risks posed by malicious agents, mitigating\nnegative impacts and enhancing MAS security. Experiments demonstrate that an\nXGBoost model achieves a Spearman correlation of 0.89 in-domain and 0.58 in\nmore challenging scenarios. Furthermore, using AgentMonitor reduces harmful\ncontent by 6.2% and increases helpful content by 1.8% on average, enhancing\nsafety and reliability. Code is available at\n\\url{https://github.com/chanchimin/AgentMonitor}.\n","authors":["Chi-Min Chan","Jianxuan Yu","Weize Chen","Chunyang Jiang","Xinyu Liu","Weijie Shi","Zhiyuan Liu","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2408.14972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14968v1","updated":"2024-08-27T11:21:19Z","published":"2024-08-27T11:21:19Z","title":"MRSE: An Efficient Multi-modality Retrieval System for Large Scale\n E-commerce","summary":" Providing high-quality item recall for text queries is crucial in large-scale\ne-commerce search systems. Current Embedding-based Retrieval Systems (ERS)\nembed queries and items into a shared low-dimensional space, but uni-modality\nERS rely too heavily on textual features, making them unreliable in complex\ncontexts. While multi-modality ERS incorporate various data sources, they often\noverlook individual preferences for different modalities, leading to suboptimal\nresults. To address these issues, we propose MRSE, a Multi-modality Retrieval\nSystem that integrates text, item images, and user preferences through\nlightweight mixture-of-expert (LMoE) modules to better align features across\nand within modalities. MRSE also builds user profiles at a multi-modality level\nand introduces a novel hybrid loss function that enhances consistency and\nrobustness using hard negative sampling. Experiments on a large-scale dataset\nfrom Shopee and online A/B testing show that MRSE achieves an 18.9% improvement\nin offline relevance and a 3.7% gain in online core metrics compared to\nShopee's state-of-the-art uni-modality system.\n","authors":["Hao Jiang","Haoxiang Zhang","Qingshan Hou","Chaofeng Chen","Weisi Lin","Jingchang Zhang","Annan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14960v1","updated":"2024-08-27T11:07:15Z","published":"2024-08-27T11:07:15Z","title":"Multilingual Arbitrage: Optimizing Data Pools to Accelerate Multilingual\n Progress","summary":" The use of synthetic data has played a critical role in recent state-of-art\nbreakthroughs. However, overly relying on a single oracle teacher model to\ngenerate data has been shown to lead to model collapse and invite propagation\nof biases. These limitations are particularly evident in multilingual settings,\nwhere the absence of a universally effective teacher model that excels across\nall languages presents significant challenges. In this work, we address these\nextreme difference by introducing \"multilingual arbitrage\", which capitalizes\non performance variations between multiple models for a given language. To do\nso, we strategically route samples through a diverse pool of models, each with\nunique strengths in different languages. 
Across exhaustive experiments on\nstate-of-art models, our work suggests that arbitrage techniques allow for\nspectacular gains in performance that far outperform relying on a single\nteacher. In particular, compared to the best single teacher, we observe gains\nof up to 56.5% improvement in win rates averaged across all languages when\nswitching to multilingual arbitrage. We observe the most significant gains for\nthe least resourced languages in our pool.\n","authors":["Ayomide Odumakinde","Daniel D'souza","Pat Verga","Beyza Ermis","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2408.14960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15504v2","updated":"2024-08-27T10:07:27Z","published":"2024-06-19T16:43:56Z","title":"Dr.E Bridges Graphs with Large Language Models through Words","summary":" Significant efforts have been dedicated to integrating the powerful Large\nLanguage Models (LLMs) with diverse modalities, particularly focusing on the\nfusion of language, vision and audio data. However, the graph-structured data,\nwhich is inherently rich in structural and domain-specific knowledge, has not\nyet been gracefully adapted to LLMs. Existing methods either describe the graph\nwith raw text, suffering the loss of graph structural information, or feed\nGraph Neural Network (GNN) embeddings into LLMs at the cost of losing\nexplainable prompt semantics. To bridge this gap, we introduce an end-to-end\nmodality-aligning framework for LLM-graph alignment: Dual-Residual Vector\nQuantized-Variational AutoEncoder, namely Dr.E. Our approach is purposefully\ndesigned to facilitate token-level alignment with LLMs, enabling an effective\ntranslation of the intrinsic `language' of graphs into comprehensible natural\nlanguage. We also manage to enhance LLMs' more robust structural understanding\nof graphs by incorporating multiple views of the central nodes based on their\nsurrounding nodes at various distances. Our experimental evaluations on\nstandard graph tasks demonstrate competitive performance against other\nstate-of-the-art (SOTA) approaches. Additionally, our framework ensures certain\nvisual interpretability, efficiency, and robustness, marking the promising\nsuccessful endeavor to achieve token-level alignment between LLMs and GNNs. Our\ncode is available at: https://anonymous.4open.science/r/dre-817.\n","authors":["Zipeng Liu","Likang Wu","Ming He","Zhong Guan","Hongke Zhao","Nan Feng"],"pdf_url":"https://arxiv.org/pdf/2406.15504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14909v1","updated":"2024-08-27T09:35:49Z","published":"2024-08-27T09:35:49Z","title":"SpikingSSMs: Learning Long Sequences with Sparse and Parallel Spiking\n State Space Models","summary":" Known as low energy consumption networks, spiking neural networks (SNNs) have\ngained a lot of attention within the past decades. While SNNs are increasing\ncompetitive with artificial neural networks (ANNs) for vision tasks, they are\nrarely used for long sequence tasks, despite their intrinsic temporal dynamics.\nIn this work, we develop spiking state space models (SpikingSSMs) for long\nsequence learning by leveraging on the sequence learning abilities of state\nspace models (SSMs). Inspired by dendritic neuron structure, we hierarchically\nintegrate neuronal dynamics with the original SSM block, meanwhile realizing\nsparse synaptic computation. 
Furthermore, to solve the conflict of event-driven\nneuronal dynamics with parallel computing, we propose a light-weight surrogate\ndynamic network which accurately predicts the after-reset membrane potential\nand compatible to learnable thresholds, enabling orders of acceleration in\ntraining speed compared with conventional iterative methods. On the long range\narena benchmark task, SpikingSSM achieves competitive performance to\nstate-of-the-art SSMs meanwhile realizing on average 90\\% of network sparsity.\nOn language modeling, our network significantly surpasses existing spiking\nlarge language models (spikingLLMs) on the WikiText-103 dataset with only a\nthird of the model size, demonstrating its potential as backbone architecture\nfor low computation cost LLMs.\n","authors":["Shuaijie Shen","Chao Wang","Renzhuo Huang","Yan Zhong","Qinghai Guo","Zhichao Lu","Jianguo Zhang","Luziwei Leng"],"pdf_url":"https://arxiv.org/pdf/2408.14909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14908v1","updated":"2024-08-27T09:35:13Z","published":"2024-08-27T09:35:13Z","title":"Triplètoile: Extraction of Knowledge from Microblogging Text","summary":" Numerous methods and pipelines have recently emerged for the automatic\nextraction of knowledge graphs from documents such as scientific publications\nand patents. However, adapting these methods to incorporate alternative text\nsources like micro-blogging posts and news has proven challenging as they\nstruggle to model open-domain entities and relations, typically found in these\nsources. In this paper, we propose an enhanced information extraction pipeline\ntailored to the extraction of a knowledge graph comprising open-domain entities\nfrom micro-blogging posts on social media platforms. Our pipeline leverages\ndependency parsing and classifies entity relations in an unsupervised manner\nthrough hierarchical clustering over word embeddings. We provide a use case on\nextracting semantic triples from a corpus of 100 thousand tweets about digital\ntransformation and publicly release the generated knowledge graph. On the same\ndataset, we conduct two experimental evaluations, showing that the system\nproduces triples with precision over 95% and outperforms similar pipelines of\naround 5% in terms of precision, while generating a comparatively higher number\nof triples.\n","authors":["Vanni Zavarella","Sergio Consoli","Diego Reforgiato Recupero","Gianni Fenu","Simone Angioni","Davide Buscaldi","Danilo Dessì","Francesco Osborne"],"pdf_url":"https://arxiv.org/pdf/2408.14908v1.pdf","comment":"42 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.14906v1","updated":"2024-08-27T09:34:38Z","published":"2024-08-27T09:34:38Z","title":"Writing in the Margins: Better Inference Pattern for Long Context\n Retrieval","summary":" In this paper, we introduce Writing in the Margins (WiM), a new inference\npattern for Large Language Models designed to optimize the handling of long\ninput sequences in retrieval-oriented tasks. This approach leverages the\nchunked prefill of the key-value cache to perform segment-wise inference, which\nenables efficient processing of extensive contexts along with the generation\nand classification of intermediate information (\"margins\") that guide the model\ntowards specific tasks. This method increases computational overhead marginally\nwhile significantly enhancing the performance of off-the-shelf models without\nthe need for fine-tuning. 
Specifically, we observe that WiM provides an average\nenhancement of 7.5% in accuracy for reasoning skills (HotpotQA, MultiHop-RAG)\nand more than a 30.0% increase in the F1-score for aggregation tasks (CWE).\nAdditionally, we show how the proposed pattern fits into an interactive\nretrieval design that provides end-users with ongoing updates about the\nprogress of context processing, and pinpoints the integration of relevant\ninformation into the final response. We release our implementation of WiM using\nHugging Face Transformers library at\nhttps://github.com/writer/writing-in-the-margins.\n","authors":["Melisa Russak","Umar Jamil","Christopher Bryant","Kiran Kamble","Axel Magnuson","Mateusz Russak","Waseem AlShikh"],"pdf_url":"https://arxiv.org/pdf/2408.14906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14895v1","updated":"2024-08-27T09:18:57Z","published":"2024-08-27T09:18:57Z","title":"VHAKG: A Multi-modal Knowledge Graph Based on Synchronized Multi-view\n Videos of Daily Activities","summary":" Multi-modal knowledge graphs (MMKGs), which ground various non-symbolic data\n(e.g., images and videos) into symbols, have attracted attention as resources\nenabling knowledge processing and machine learning across modalities. However,\nthe construction of MMKGs for videos consisting of multiple events, such as\ndaily activities, is still in the early stages. In this paper, we construct an\nMMKG based on synchronized multi-view simulated videos of daily activities.\nBesides representing the content of daily life videos as event-centric\nknowledge, our MMKG also includes frame-by-frame fine-grained changes, such as\nbounding boxes within video frames. In addition, we provide support tools for\nquerying our MMKG. As an application example, we demonstrate that our MMKG\nfacilitates benchmarking vision-language models by providing the necessary\nvision-language datasets for a tailored task.\n","authors":["Shusaku Egami","Takahiro Ugai","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2408.14895v1.pdf","comment":"5 pages,4 figures, accepted by CIKM2024 Resource Track"},{"id":"http://arxiv.org/abs/2408.14892v1","updated":"2024-08-27T09:07:37Z","published":"2024-08-27T09:07:37Z","title":"A Functional Trade-off between Prosodic and Semantic Cues in Conveying\n Sarcasm","summary":" This study investigates the acoustic features of sarcasm and disentangles the\ninterplay between the propensity of an utterance being used sarcastically and\nthe presence of prosodic cues signaling sarcasm. Using a dataset of sarcastic\nutterances compiled from television shows, we analyze the prosodic features\nwithin utterances and key phrases belonging to three distinct sarcasm\ncategories (embedded, propositional, and illocutionary), which vary in the\ndegree of semantic cues present, and compare them to neutral expressions.\nResults show that in phrases where the sarcastic meaning is salient from the\nsemantics, the prosodic cues are less relevant than when the sarcastic meaning\nis not evident from the semantics, suggesting a trade-off between prosodic and\nsemantic cues of sarcasm at the phrase level. 
These findings highlight a\nlessened reliance on prosodic modulation in semantically dense sarcastic\nexpressions and a nuanced interaction that shapes the communication of\nsarcastic intent.\n","authors":["Zhu Li","Xiyuan Gao","Yuqing Zhang","Shekhar Nayak","Matt Coler"],"pdf_url":"https://arxiv.org/pdf/2408.14892v1.pdf","comment":"accepted at Interspeech 2024"},{"id":"http://arxiv.org/abs/2408.14874v1","updated":"2024-08-27T08:43:32Z","published":"2024-08-27T08:43:32Z","title":"Inverse-Q*: Token Level Reinforcement Learning for Aligning Large\n Language Models Without Preference Data","summary":" Reinforcement Learning from Human Feedback (RLHF) has proven effective in\naligning large language models with human intentions, yet it often relies on\ncomplex methodologies like Proximal Policy Optimization (PPO) that require\nextensive hyper-parameter tuning and present challenges in sample efficiency\nand stability. In this paper, we introduce Inverse-Q*, an innovative framework\nthat transcends traditional RL methods by optimizing token-level reinforcement\nlearning without the need for additional reward or value models. Inverse-Q*\nleverages direct preference optimization techniques but extends them by\nestimating the conditionally optimal policy directly from the model's\nresponses, facilitating more granular and flexible policy shaping. Our approach\nreduces reliance on human annotation and external supervision, making it\nespecially suitable for low-resource settings. We present extensive\nexperimental results demonstrating that Inverse-Q* not only matches but\npotentially exceeds the effectiveness of PPO in terms of convergence speed and\nthe alignment of model responses with human preferences. Our findings suggest\nthat Inverse-Q* offers a practical and robust alternative to conventional RLHF\napproaches, paving the way for more efficient and adaptable model training\napproaches.\n","authors":["Han Xia","Songyang Gao","Qiming Ge","Zhiheng Xi","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2408.14874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14866v1","updated":"2024-08-27T08:38:48Z","published":"2024-08-27T08:38:48Z","title":"Advancing Adversarial Suffix Transfer Learning on Aligned Large Language\n Models","summary":" Large Language Models (LLMs) face safety concerns due to potential misuse\nby malicious users. Recent red-teaming efforts have identified adversarial\nsuffixes capable of jailbreaking LLMs using the gradient-based search algorithm\nGreedy Coordinate Gradient (GCG). However, GCG struggles with computational\ninefficiency, limiting further investigations regarding suffix transferability\nand scalability across models and data. In this work, we bridge the connection\nbetween search efficiency and suffix transferability. We propose a two-stage\ntransfer learning framework, DeGCG, which decouples the search process into\nbehavior-agnostic pre-searching and behavior-relevant post-searching.\nSpecifically, we employ direct first target token optimization in pre-searching\nto facilitate the search process. We apply our approach to cross-model,\ncross-data, and self-transfer scenarios. Furthermore, we introduce an\ninterleaved variant of our approach, i-DeGCG, which iteratively leverages\nself-transferability to accelerate the search process. 
Experiments on HarmBench\ndemonstrate the efficiency of our approach across various models and domains.\nNotably, our i-DeGCG outperforms the baseline on Llama2-chat-7b with ASRs of\n$43.9$ ($+22.2$) and $39.0$ ($+19.5$) on valid and test sets, respectively.\nFurther analysis on cross-model transfer indicates the pivotal role of first\ntarget token optimization in leveraging suffix transferability for efficient\nsearching.\n","authors":["Hongfu Liu","Yuxi Xie","Ye Wang","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2408.14866v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.03848v7","updated":"2024-08-27T08:33:29Z","published":"2024-02-06T09:50:08Z","title":"ANLS* -- A Universal Document Processing Metric for Generative Large\n Language Models","summary":" Traditionally, discriminative models have been the predominant choice for\ntasks like document classification and information extraction. These models\nmake predictions that fall into a limited number of predefined classes,\nfacilitating a binary true or false evaluation and enabling the direct\ncalculation of metrics such as the F1 score. However, recent advancements in\ngenerative large language models (GLLMs) have prompted a shift in the field due\nto their enhanced zero-shot capabilities, which eliminate the need for a\ndownstream dataset and computationally expensive fine-tuning. However,\nevaluating GLLMs presents a challenge as the binary true or false evaluation\nused for discriminative models is not applicable to the predictions made by\nGLLMs.\n This paper introduces a new metric for generative models called ANLS* for\nevaluating a wide variety of tasks, including information extraction and\nclassification tasks. The ANLS* metric extends existing ANLS metrics as a\ndrop-in-replacement and is still compatible with previously reported ANLS\nscores. An evaluation of 7 different datasets, and more than 10 different GLLMs\ntogether with 3 different prompting methods using the ANLS* metric is also\nprovided, demonstrating the importance of the proposed metric.\n We also benchmark a novel approach to generate prompts for documents, called\nSFT, against other prompting techniques such as LATIN. In almost all cases, SFT\noutperforms other techniques and improves the state-of-the-art, sometimes by as\nmuch as $10$ percentage points.\n Sources are available at https://github.com/deepopinion/anls_star_metric\n","authors":["David Peer","Philemon Schöpf","Volckmar Nebendahl","Alexander Rietzler","Sebastian Stabinger"],"pdf_url":"https://arxiv.org/pdf/2402.03848v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03870v2","updated":"2024-08-27T08:31:04Z","published":"2024-03-06T17:23:28Z","title":"Learning to Decode Collaboratively with Multiple Language Models","summary":" We propose a method to teach multiple large language models (LLM) to\ncollaborate by interleaving their generations at the token level. We model the\ndecision of which LLM generates the next token as a latent variable. By\noptimizing the marginal likelihood of a training set under our latent variable\nmodel, the base LLM automatically learns when to generate itself and when to\ncall on one of the ``assistant'' language models to generate, all without\ndirect supervision. Token-level collaboration during decoding allows for a\nfusion of each model's expertise in a manner tailored to the specific task at\nhand. 
Our collaborative decoding is especially useful in cross-domain settings\nwhere a generalist base LLM learns to invoke domain expert models. On\ninstruction-following, domain-specific QA, and reasoning tasks, we show that\nthe performance of the joint system exceeds that of the individual models.\nThrough qualitative analysis of the learned latent decisions, we show models\ntrained with our method exhibit several interesting collaboration patterns,\ne.g., template-filling. Our code is available at\nhttps://github.com/clinicalml/co-llm.\n","authors":["Shannon Zejiang Shen","Hunter Lang","Bailin Wang","Yoon Kim","David Sontag"],"pdf_url":"https://arxiv.org/pdf/2403.03870v2.pdf","comment":"16 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2408.14853v1","updated":"2024-08-27T08:12:08Z","published":"2024-08-27T08:12:08Z","title":"Detecting AI Flaws: Target-Driven Attacks on Internal Faults in Language\n Models","summary":" Large Language Models (LLMs) have become a focal point in the rapidly\nevolving field of artificial intelligence. However, a critical concern is the\npresence of toxic content within the pre-training corpus of these models, which\ncan lead to the generation of inappropriate outputs. Investigating methods for\ndetecting internal faults in LLMs can help us understand their limitations and\nimprove their security. Existing methods primarily focus on jailbreaking\nattacks, which involve manually or automatically constructing adversarial\ncontent to prompt the target LLM to generate unexpected responses. These\nmethods rely heavily on prompt engineering, which is time-consuming and usually\nrequires specially designed questions. To address these challenges, this paper\nproposes a target-driven attack paradigm that focuses on directly eliciting the\ntarget response instead of optimizing the prompts. We introduce the use of\nanother LLM as the detector for toxic content, referred to as ToxDet. Given a\ntarget toxic response, ToxDet can generate a possible question and a\npreliminary answer to provoke the target model into producing desired toxic\nresponses with meanings equivalent to the provided one. ToxDet is trained by\ninteracting with the target LLM and receiving reward signals from it, utilizing\nreinforcement learning for the optimization process. While the primary focus of\nthe target models is on open-source LLMs, the fine-tuned ToxDet can also be\ntransferred to attack black-box models such as GPT-4o, achieving notable\nresults. Experimental results on AdvBench and HH-Harmless datasets demonstrate\nthe effectiveness of our methods in detecting the tendencies of target LLMs to\ngenerate harmful responses. This algorithm not only exposes vulnerabilities but\nalso provides a valuable resource for researchers to strengthen their models\nagainst such attacks.\n","authors":["Yuhao Du","Zhuo Li","Pengyu Cheng","Xiang Wan","Anningzhe Gao"],"pdf_url":"https://arxiv.org/pdf/2408.14853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14053v2","updated":"2024-08-27T08:05:07Z","published":"2024-08-26T07:19:07Z","title":"Enhancing Depression Diagnosis with Chain-of-Thought Prompting","summary":" When using AI to detect signs of depressive disorder, AI models habitually\ndraw preemptive conclusions. We theorize that using chain-of-thought (CoT)\nprompting to evaluate Patient Health Questionnaire-8 (PHQ-8) scores will\nimprove the accuracy of the scores determined by AI models. 
In our findings,\nwhen the models reasoned with CoT, the estimated PHQ-8 scores were consistently\ncloser on average to the accepted true scores reported by each participant\ncompared to when not using CoT. Our goal is to expand upon AI models'\nunderstanding of the intricacies of human conversation, allowing them to more\neffectively assess a patient's feelings and tone, therefore being able to more\naccurately discern mental disorder symptoms; ultimately, we hope to augment AI\nmodels' abilities, so that they can be widely accessible and used in the\nmedical field.\n","authors":["Elysia Shi","Adithri Manda","London Chowdhury","Runeema Arun","Kevin Zhu","Michael Lam"],"pdf_url":"https://arxiv.org/pdf/2408.14053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14849v1","updated":"2024-08-27T08:01:13Z","published":"2024-08-27T08:01:13Z","title":"Project SHADOW: Symbolic Higher-order Associative Deductive reasoning On\n Wikidata using LM probing","summary":" We introduce SHADOW, a fine-tuned language model trained on an intermediate\ntask using associative deductive reasoning, and measure its performance on a\nknowledge base construction task using Wikidata triple completion. We evaluate\nSHADOW on the LM-KBC 2024 challenge and show that it outperforms the baseline\nsolution by 20% with a F1 score of 68.72%.\n","authors":["Hanna Abi Akl"],"pdf_url":"https://arxiv.org/pdf/2408.14849v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.03600v2","updated":"2024-08-27T08:00:03Z","published":"2024-07-04T03:20:31Z","title":"Chain-of-Thought Augmentation with Logit Contrast for Enhanced Reasoning\n in Language Models","summary":" Rapidly increasing model scales coupled with steering methods such as\nchain-of-thought prompting have led to drastic improvements in language model\nreasoning. At the same time, models struggle with compositional generalization\nand are far from human performance on many reasoning-based benchmarks.\nLeveraging the success of chain-of-thought prompting, and also taking\ninspiration from context-aware decoding (CAD), we explore input-based\ncontrasting methods to further encourage the type of reasoning induced by\nchain-of-thought prompting. While work remains to stabilize these results\nacross datasets and models, the improvements we find warrant further\ninvestigation into input-based steering methods for context-aware reasoning.\n","authors":["Jay Shim","Grant Kruttschnitt","Alyssa Ma","Daniel Kim","Benjamin Chek","Athul Anand","Kevin Zhu","Sean O'Brien"],"pdf_url":"https://arxiv.org/pdf/2407.03600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14845v1","updated":"2024-08-27T07:56:35Z","published":"2024-08-27T07:56:35Z","title":"AAVENUE: Detecting LLM Biases on NLU Tasks in AAVE via a Novel Benchmark","summary":" Detecting biases in natural language understanding (NLU) for African American\nVernacular English (AAVE) is crucial to developing inclusive natural language\nprocessing (NLP) systems. To address dialect-induced performance discrepancies,\nwe introduce AAVENUE ({AAVE} {N}atural Language {U}nderstanding {E}valuation),\na benchmark for evaluating large language model (LLM) performance on NLU tasks\nin AAVE and Standard American English (SAE). 
AAVENUE builds upon and extends\nexisting benchmarks like VALUE, replacing deterministic syntactic and\nmorphological transformations with a more flexible methodology leveraging\nLLM-based translation with few-shot prompting, improving performance across our\nevaluation metrics when translating key tasks from the GLUE and SuperGLUE\nbenchmarks. We compare AAVENUE and VALUE translations using five popular LLMs\nand a comprehensive set of metrics including fluency, BARTScore, quality,\ncoherence, and understandability. Additionally, we recruit fluent AAVE speakers\nto validate our translations for authenticity. Our evaluations reveal that LLMs\nconsistently perform better on SAE tasks than AAVE-translated versions,\nunderscoring inherent biases and highlighting the need for more inclusive NLP\nmodels. We have open-sourced our source code on GitHub and created a website to\nshowcase our work at https://aavenue.live.\n","authors":["Abhay Gupta","Philip Meng","Ece Yurtseven","Sean O'Brien","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14840v1","updated":"2024-08-27T07:51:26Z","published":"2024-08-27T07:51:26Z","title":"CL4KGE: A Curriculum Learning Method for Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) constitutes a foundational task, directed\ntowards learning representations for entities and relations within knowledge\ngraphs (KGs), with the objective of crafting representations comprehensive\nenough to approximate the logical and symbolic interconnections among entities.\nIn this paper, we define a metric Z-counts to measure the difficulty of\ntraining each triple ($<$head entity, relation, tail entity$>$) in KGs with\ntheoretical analysis. Based on this metric, we propose \\textbf{CL4KGE}, an\nefficient \\textbf{C}urriculum \\textbf{L}earning based training strategy for\n\\textbf{KGE}. This method includes a difficulty measurer and a training\nscheduler that aids in the training of KGE models. Our approach possesses the\nflexibility to act as a plugin within a wide range of KGE models, with the\nadded advantage of adaptability to the majority of KGs in existence. The\nproposed method has been evaluated on popular KGE models, and the results\ndemonstrate that it enhances the state-of-the-art methods. The use of Z-counts\nas a metric has enabled the identification of challenging triples in KGs, which\nhelps in devising effective training strategies.\n","authors":["Yang Liu","Chuan Zhou","Peng Zhang","Yanan Cao","Yongchao Liu","Zhao Li","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14840v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.14830v1","updated":"2024-08-27T07:27:16Z","published":"2024-08-27T07:27:16Z","title":"PolicyLR: A Logic Representation For Privacy Policies","summary":" Privacy policies are crucial in the online ecosystem, defining how services\nhandle user data and adhere to regulations such as GDPR and CCPA. However,\ntheir complexity and frequent updates often make them difficult for\nstakeholders to understand and analyze. Current automated analysis methods,\nwhich utilize natural language processing, have limitations. They typically\nfocus on individual tasks and fail to capture the full context of the policies.\nWe propose PolicyLR, a new paradigm that offers a comprehensive\nmachine-readable representation of privacy policies, serving as an all-in-one\nsolution for multiple downstream tasks. 
PolicyLR converts privacy policies into\na machine-readable format using valuations of atomic formulae, allowing for\nformal definitions of tasks like compliance and consistency. We have developed\na compiler that transforms unstructured policy text into this format using\noff-the-shelf Large Language Models (LLMs). This compiler breaks down the\ntransformation task into a two-stage translation and entailment procedure. This\nprocedure considers the full context of the privacy policy to infer a complex\nformula, where each formula consists of simpler atomic formulae. The advantage\nof this model is that PolicyLR is interpretable by design and grounded in\nsegments of the privacy policy. We evaluated the compiler using ToS;DR, a\ncommunity-annotated privacy policy entailment dataset. Utilizing open-source\nLLMs, our compiler achieves precision and recall values of 0.91 and 0.88,\nrespectively. Finally, we demonstrate the utility of PolicyLR in three privacy\ntasks: Policy Compliance, Inconsistency Detection, and Privacy Comparison\nShopping.\n","authors":["Ashish Hooda","Rishabh Khandelwal","Prasad Chalasani","Kassem Fawaz","Somesh Jha"],"pdf_url":"https://arxiv.org/pdf/2408.14830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16349v3","updated":"2024-08-27T07:22:50Z","published":"2023-08-30T22:50:32Z","title":"Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning\n Based on Visually Grounded Conversations","summary":" We introduce Affective Visual Dialog, an emotion explanation and reasoning\ntask as a testbed for research on understanding the formation of emotions in\nvisually grounded conversations. The task involves three skills: (1)\nDialog-based Question Answering (2) Dialog-based Emotion Prediction and (3)\nAffective emotion explanation generation based on the dialog. Our key\ncontribution is the collection of a large-scale dataset, dubbed AffectVisDial,\nconsisting of 50K 10-turn visually grounded dialogs as well as concluding\nemotion attributions and dialog-informed textual emotion explanations,\nresulting in a total of 27,180 working hours. We explain our design decisions\nin collecting the dataset and introduce the questioner and answerer tasks that\nare associated with the participants in the conversation. We train and\ndemonstrate solid Affective Visual Dialog baselines adapted from\nstate-of-the-art models. Remarkably, the responses generated by our models show\npromising emotional reasoning abilities in response to visually grounded\nconversations. Our project page is available at\nhttps://affective-visual-dialog.github.io.\n","authors":["Kilichbek Haydarov","Xiaoqian Shen","Avinash Madasu","Mahmoud Salem","Li-Jia Li","Gamaleldin Elsayed","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.16349v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14825v1","updated":"2024-08-27T07:11:45Z","published":"2024-08-27T07:11:45Z","title":"From Rule-Based Models to Deep Learning Transformers Architectures for\n Natural Language Processing and Sign Language Translation Systems: Survey,\n Taxonomy and Performance Evaluation","summary":" With the growing Deaf and Hard of Hearing population worldwide and the\npersistent shortage of certified sign language interpreters, there is a\npressing need for an efficient, signs-driven, integrated end-to-end translation\nsystem, from sign to gloss to text and vice-versa. There has been a wealth of\nresearch on machine translations and related reviews. 
However, there are few\nworks on sign language machine translation considering the particularity of the\nlanguage being continuous and dynamic. This paper aims to address this void,\nproviding a retrospective analysis of the temporal evolution of sign language\nmachine translation algorithms and a taxonomy of the Transformers\narchitectures, the most used approach in language translation. We also present\nthe requirements of a real-time Quality-of-Service sign language machine\ntranslation system underpinned by accurate deep learning algorithms. We propose\nfuture research directions for sign language translation systems.\n","authors":["Nada Shahin","Leila Ismail"],"pdf_url":"https://arxiv.org/pdf/2408.14825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05885v2","updated":"2024-08-27T06:53:16Z","published":"2024-06-09T18:45:41Z","title":"Are Large Language Models Actually Good at Text Style Transfer?","summary":" We analyze the performance of large language models (LLMs) on Text Style\nTransfer (TST), specifically focusing on sentiment transfer and text\ndetoxification across three languages: English, Hindi, and Bengali. Text Style\nTransfer involves modifying the linguistic style of a text while preserving its\ncore content. We evaluate the capabilities of pre-trained LLMs using zero-shot\nand few-shot prompting as well as parameter-efficient finetuning on publicly\navailable datasets. Our evaluation using automatic metrics, GPT-4 and human\nevaluations reveals that while some prompted LLMs perform well in English,\ntheir performance on other languages (Hindi, Bengali) remains average.\nHowever, finetuning significantly improves results compared to zero-shot and\nfew-shot prompting, making them comparable to previous state-of-the-art. This\nunderscores the necessity of dedicated datasets and specialized models for\neffective TST.\n","authors":["Sourabrata Mukherjee","Atul Kr. Ojha","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2406.05885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20805v3","updated":"2024-08-27T06:51:00Z","published":"2024-05-31T14:05:27Z","title":"Multilingual Text Style Transfer: Datasets & Models for Indian Languages","summary":" Text style transfer (TST) involves altering the linguistic style of a text\nwhile preserving its core content. This paper focuses on sentiment transfer, a\npopular TST subtask, across a spectrum of Indian languages: Hindi, Magahi,\nMalayalam, Marathi, Punjabi, Odia, Telugu, and Urdu, expanding upon previous\nwork on English-Bangla sentiment transfer (Mukherjee et al., 2023). We\nintroduce dedicated datasets of 1,000 positive and 1,000 negative\nstyle-parallel sentences for each of these eight languages. We then evaluate\nthe performance of various benchmark models categorized into parallel,\nnon-parallel, cross-lingual, and shared learning approaches, including the\nLlama2 and GPT-3.5 large language models (LLMs). Our experiments highlight the\nsignificance of parallel data in TST and demonstrate the effectiveness of the\nMasked Style Filling (MSF) approach (Mukherjee et al., 2023) in non-parallel\ntechniques. Moreover, cross-lingual and joint multilingual learning methods\nshow promise, offering insights into selecting optimal models tailored to the\nspecific language and task requirements. To the best of our knowledge, this\nwork represents the first comprehensive exploration of the TST task as\nsentiment transfer across a diverse set of languages.\n","authors":["Sourabrata Mukherjee","Atul Kr. 
Ojha","Akanksha Bansal","Deepak Alok","John P. McCrae","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2405.20805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14809v1","updated":"2024-08-27T06:44:28Z","published":"2024-08-27T06:44:28Z","title":"GSIFN: A Graph-Structured and Interlaced-Masked Multimodal Transformer\n Based Fusion Network for Multimodal Sentiment Analysis","summary":" Multimodal Sentiment Analysis (MSA) leverages multiple modals to analyze\nsentiments. Typically, advanced fusion methods and representation\nlearning-based methods are designed to tackle it. Our proposed GSIFN solves two\nkey problems to be solved in MSA: (i) In multimodal fusion, the decoupling of\nmodal combinations and tremendous parameter redundancy in existing fusion\nmethods, which lead to poor fusion performance and efficiency. (ii) The\ntrade-off between representation capability and computation overhead of the\nunimodal feature extractors and enhancers. GSIFN incorporates two main\ncomponents to solve these problems: (i) Graph-Structured and Interlaced-Masked\nMultimodal Transformer. It adopts the Interlaced Mask mechanism to construct\nrobust multimodal graph embedding, achieve all-modal-in-one Transformer-based\nfusion, and greatly reduce the computation overhead. (ii) A self-supervised\nlearning framework with low computation overhead and high performance, which\nutilizes a parallelized LSTM with matrix memory to enhance non-verbal modal\nfeature for unimodal label generation. Evaluated on the MSA datasets CMU-MOSI,\nCMU-MOSEI, and CH-SIMS, GSIFN demonstrates superior performance with\nsignificantly lower computation overhead compared with state-of-the-art\nmethods.\n","authors":["Yijie Jin"],"pdf_url":"https://arxiv.org/pdf/2408.14809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08841v2","updated":"2024-08-27T06:23:45Z","published":"2024-08-16T17:00:11Z","title":"FLEXTAF: Enhancing Table Reasoning with Flexible Tabular Formats","summary":" The table reasoning task aims to answer the question according to the given\ntable. Currently, using Large Language Models (LLMs) is the predominant method\nfor table reasoning. Most existing methods employ a fixed tabular format to\nrepresent the table, which could limit the performance. Given that each\ninstance requires different capabilities and models possess varying abilities,\nwe assert that different instances and models suit different tabular formats.\nWe prove the aforementioned claim through quantitative analysis of experimental\nresults, where different instances and models achieve different performances\nusing various tabular formats. Building on this discussion, we propose\nFLEXTAF-Single and FLEXTAF-Vote to enhance table reasoning performance by\nemploying flexible tabular formats. Specifically, (i) FLEXTAF-Single trains a\nclassifier to predict the most suitable tabular format based on the instance\nand the LLM. 
(ii) FLEXTAF-Vote integrates the results across different formats.\nOur experiments on WikiTableQuestions and TabFact reveal significant\nimprovements, with average gains of 2.3% and 4.8% compared to the best\nperformance achieved using a fixed tabular format with greedy decoding and\nself-consistency decoding, thereby validating the effectiveness of our methods.\n","authors":["Xuanliang Zhang","Dingzirui Wang","Longxu Dou","Baoxin Wang","Dayong Wu","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08841v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14043v2","updated":"2024-08-27T06:18:05Z","published":"2024-06-20T07:06:58Z","title":"Taxonomy-Guided Zero-Shot Recommendations with LLMs","summary":" With the emergence of large language models (LLMs) and their ability to\nperform a variety of tasks, their application in recommender systems (RecSys)\nhas shown promise. However, we are facing significant challenges when deploying\nLLMs into RecSys, such as limited prompt length, unstructured item information,\nand un-constrained generation of recommendations, leading to sub-optimal\nperformance. To address these issues, we propose a novel method using a\ntaxonomy dictionary. This method provides a systematic framework for\ncategorizing and organizing items, improving the clarity and structure of item\ninformation. By incorporating the taxonomy dictionary into LLM prompts, we\nachieve efficient token utilization and controlled feature generation, leading\nto more accurate and contextually relevant recommendations. Our Taxonomy-guided\nRecommendation (TaxRec) approach features a two-step process: one-time taxonomy\ncategorization and LLM-based recommendation, enabling zero-shot recommendations\nwithout the need for domain-specific fine-tuning. Experimental results\ndemonstrate TaxRec significantly enhances recommendation quality compared to\ntraditional zero-shot approaches, showcasing its efficacy as personal\nrecommender with LLMs. Code is available at\nhttps://github.com/yueqingliang1/TaxRec.\n","authors":["Yueqing Liang","Liangwei Yang","Chen Wang","Xiongxiao Xu","Philip S. Yu","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2406.14043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08779v2","updated":"2024-08-27T06:14:54Z","published":"2024-08-16T14:43:15Z","title":"DAC: Decomposed Automation Correction for Text-to-SQL","summary":" Text-to-SQL is an important task that helps people obtain information from\ndatabases by automatically generating SQL queries. Considering the brilliant\nperformance, approaches based on Large Language Models (LLMs) become the\nmainstream for text-to-SQL. Among these approaches, automated correction is an\neffective approach that further enhances performance by correcting the mistakes\nin the generated results. The existing correction methods require LLMs to\ndirectly correct with generated SQL, while previous research shows that LLMs do\nnot know how to detect mistakes, leading to poor performance. Therefore, in\nthis paper, we propose to employ the decomposed correction to enhance\ntext-to-SQL performance. We first demonstrate that decomposed correction\noutperforms direct correction since detecting and fixing mistakes with the\nresults of the decomposed sub-tasks is easier than with SQL. Based on this\nanalysis, we introduce Decomposed Automation Correction (DAC), which corrects\nSQL by decomposing text-to-SQL into entity linking and skeleton parsing. 
DAC\nfirst generates the entity and skeleton corresponding to the question and then\ncompares the differences between the initial SQL and the generated entities and\nskeleton as feedback for correction. Experimental results show that our method\nimproves performance by $3.7\\%$ on average of Spider, Bird, and KaggleDBQA\ncompared with the baseline method, demonstrating the effectiveness of DAC.\n","authors":["Dingzirui Wang","Longxu Dou","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08072v2","updated":"2024-08-27T04:50:12Z","published":"2024-08-15T10:44:38Z","title":"I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative\n Self-Enhancement Paradigm","summary":" Large Language Models (LLMs) have achieved significant advancements, however,\nthe common learning paradigm treats LLMs as passive information repositories,\nneglecting their potential for active learning and alignment. Some approaches\ntrain LLMs using their own generated synthetic data, exploring the possibility\nof active alignment. However, there is still a huge gap between these one-time\nalignment methods and the continuous automatic alignment of humans. In this\npaper, we introduce \\textbf{I-SHEEP}, an \\textbf{I}terative\n\\textbf{S}elf-En\\textbf{H}anc\\textbf{E}m\\textbf{E}nt \\textbf{P}aradigm.This\nhuman-like paradigm enables LLMs to \\textbf{continuously self-align from\nscratch with nothing}. Compared to the one-time alignment method Dromedary\n\\cite{sun2023principledriven}, which refers to the first iteration in this\npaper, I-SHEEP can significantly enhance capacities on both Qwen and Llama\nmodels. I-SHEEP achieves a maximum relative improvement of 78.2\\% in the Alpaca\nEval, 24.0\\% in the MT Bench, and an absolute increase of 8.88\\% in the IFEval\naccuracy over subsequent iterations in Qwen-1.5 72B model. Additionally,\nI-SHEEP surpasses the base model in various standard benchmark generation\ntasks, achieving an average improvement of 24.77\\% in code generation tasks,\n12.04\\% in TrivialQA, and 20.29\\% in SQuAD. We also provide new insights based\non the experiment results. Our codes, datasets, and models are available at\n\\textbf{https://anonymous.4open.science/r/I-SHEEP}.\n","authors":["Yiming Liang","Ge Zhang","Xingwei Qu","Tianyu Zheng","Jiawei Guo","Xinrun Du","Zhenzhu Yang","Jiaheng Liu","Chenghua Lin","Lei Ma","Wenhao Huang","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06537v2","updated":"2024-08-27T04:43:59Z","published":"2024-07-09T04:17:39Z","title":"Efficient and Accurate Memorable Conversation Model using DPO based on\n sLLM","summary":" In multi-session dialog system, it is essential to continuously update the\nmemory as the session progresses. Simply accumulating memory can make it\ndifficult to focus on the content of the conversation for inference due to the\nlimited input sentence size. Therefore, efficient and accurate conversation\nmodel that is capable of managing memory to reflect the conversation history\ncontinuously is necessary. This paper presents a conversation model that\nefficiently manages memory as sessions progress and incorporates this into the\nmodel to reflect the conversation history accurately with 3 methodologies: SFT,\nDPO and DPO with SFT model. 
Our model using DPO algorithm shows an improvement\nabout 0.0591 of BERTScore in memory accuracy, and the rate of responses\nreflecting the memory increased as well. Also, response generation performance\nenhanced about 4.292 in fluency, 3.935 in coherence, and 2.896 in consistency.\nThis paper describes a training method that yields better performance than\nmodels with more than twice the parameter size, even when the model size is\nsmaller. Thus, our model demonstrates efficiency not only in terms of accuracy\nbut also in resource utilization.\n","authors":["Youngkyung Seo","Yoonseok Heo","Jun-Seok Koh","Du-Seong Chang"],"pdf_url":"https://arxiv.org/pdf/2407.06537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08374v2","updated":"2024-08-27T04:35:20Z","published":"2023-06-14T09:04:29Z","title":"SpeechGLUE: How Well Can Self-Supervised Speech Models Capture\n Linguistic Knowledge?","summary":" Self-supervised learning (SSL) for speech representation has been\nsuccessfully applied in various downstream tasks, such as speech and speaker\nrecognition. More recently, speech SSL models have also been shown to be\nbeneficial in advancing spoken language understanding tasks, implying that the\nSSL models have the potential to learn not only acoustic but also linguistic\ninformation. In this paper, we aim to clarify if speech SSL techniques can well\ncapture linguistic knowledge. For this purpose, we introduce SpeechGLUE, a\nspeech version of the General Language Understanding Evaluation (GLUE)\nbenchmark. Since GLUE comprises a variety of natural language understanding\ntasks, SpeechGLUE can elucidate the degree of linguistic ability of speech SSL\nmodels. Experiments demonstrate that speech SSL models, although inferior to\ntext-based SSL models, perform better than baselines, suggesting that they can\nacquire a certain amount of general linguistic knowledge from just unlabeled\nspeech data.\n","authors":["Takanori Ashihara","Takafumi Moriya","Kohei Matsuura","Tomohiro Tanaka","Yusuke Ijima","Taichi Asami","Marc Delcroix","Yukinori Honma"],"pdf_url":"https://arxiv.org/pdf/2306.08374v2.pdf","comment":"Accepted at INTERSPEECH 2023. This paper has been extended in a\n subsequent journal paper, see\n https://ieeexplore.ieee.org/abstract/document/10597571"},{"id":"http://arxiv.org/abs/2408.14774v1","updated":"2024-08-27T04:31:58Z","published":"2024-08-27T04:31:58Z","title":"Instruct-SkillMix: A Powerful Pipeline for LLM Instruction Tuning","summary":" We introduce Instruct-SkillMix, an automated approach for creating diverse,\nhigh quality SFT data. The Instruct-SkillMix pipeline involves two stages, each\nleveraging an existing powerful LLM: (1) Skill extraction: uses the LLM to\nextract core \"skills\" for instruction-following, either from existing datasets,\nor by directly prompting the model; (2) Data generation: uses the powerful LLM\nto generate (instruction, response) data that exhibit a randomly chosen pair of\nthese skills. Here, the use of random skill combinations promotes diversity and\ndifficulty.\n Vanilla SFT (i.e., no PPO, DPO, or RL methods) on data generated from\nInstruct-SkillMix leads to strong gains on instruction following benchmarks\nsuch as AlpacaEval 2.0, MT-Bench, and WildBench. 
With just $4$K examples,\nLLaMA-3-8B-Base achieves 42.76% length-controlled win rate on AlpacaEval 2.0.\nTo our knowledge, this achieves state-of-the-art performance among all models\nthat have only undergone SFT (no RL methods) and competes with proprietary\nmodels such as Claude 3 Opus and LLaMA-3.1-405B-Instruct.\n Ablation studies also suggest plausible reasons for why creating open\ninstruction-tuning datasets via naive crowd-sourcing has proved difficult.\nIntroducing low quality answers (\"shirkers\") in $20\\%$ of Instruct-SkillMix\nexamples causes performance to plummet, sometimes catastrophically.\n The Instruct-SkillMix pipeline is flexible and is adaptable to other\nsettings.\n","authors":["Simran Kaur","Simon Park","Anirudh Goyal","Sanjeev Arora"],"pdf_url":"https://arxiv.org/pdf/2408.14774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14856v2","updated":"2024-08-27T04:30:29Z","published":"2023-07-27T13:37:06Z","title":"Exploiting the Potential of Seq2Seq Models as Robust Few-Shot Learners","summary":" In-context learning, which offers substantial advantages over fine-tuning, is\npredominantly observed in decoder-only models, while encoder-decoder (i.e.,\nseq2seq) models excel in methods that rely on weight updates. Recently, a few\nstudies have demonstrated the feasibility of few-shot learning with seq2seq\nmodels; however, this has been limited to tasks that align well with the\nseq2seq architecture, such as summarization and translation. Inspired by these\ninitial studies, we provide a first-ever extensive experiment comparing the\nin-context few-shot learning capabilities of decoder-only and encoder-decoder\nmodels on a broad range of tasks. Furthermore, we propose two methods to more\neffectively elicit in-context learning ability in seq2seq models:\nobjective-aligned prompting and a fusion-based approach. Remarkably, our\napproach outperforms a decoder-only model that is six times larger and exhibits\nsignificant performance improvements compared to conventional seq2seq models\nacross a variety of settings. We posit that, with the right configuration and\nprompt design, seq2seq models can be highly effective few-shot learners for a\nwide spectrum of applications.\n","authors":["Jihyeon Lee","Dain Kim","Doohae Jung","Boseop Kim","Kyoung-Woon On"],"pdf_url":"https://arxiv.org/pdf/2307.14856v2.pdf","comment":"Accepted to COLM'2024"},{"id":"http://arxiv.org/abs/2408.14772v1","updated":"2024-08-27T04:20:10Z","published":"2024-08-27T04:20:10Z","title":"A global AI community requires language-diverse publishing","summary":" In this provocation, we discuss the English dominance of the AI research\ncommunity, arguing that the requirement for English language publishing upholds\nand reinforces broader regimes of extraction in AI. While large language models\nand machine translation have been celebrated as a way to break down barriers,\nwe regard their use as a symptom of linguistic exclusion of scientists and\npotential readers. We propose alternative futures for a healthier publishing\nculture, organized around three themes: administering conferences in the\nlanguages of the country in which they are held, instructing peer reviewers not\nto adjudicate the language appropriateness of papers, and offering\nopportunities to publish and present in multiple languages. We welcome new\ntranslations of this piece. 
Please contact the authors if you would like to\ncontribute one.\n","authors":["Haley Lepp","Parth Sarin"],"pdf_url":"https://arxiv.org/pdf/2408.14772v1.pdf","comment":"Translations by Michael Hardy (Guarani), Vandana Sarin and Vivek\n Sarin (Hindi), Roshna Omer Abdulrahman (Soran\\^i Kurdish), Gabriel Poesia\n (Portuguese), and Mat\\'ias Grinberg (Spanish). In the proceedings of the\n Global AI Cultures Workshop at the Twelfth International Conference on\n Learning Representations (ICLR) 2024, Vienna, Austria, May 7-11, 2024"},{"id":"http://arxiv.org/abs/2408.14470v2","updated":"2024-08-27T03:56:11Z","published":"2024-08-26T17:58:53Z","title":"Step-by-Step Unmasking for Parameter-Efficient Fine-tuning of Large\n Language Models","summary":" Fine-tuning large language models (LLMs) on downstream tasks requires\nsubstantial computational resources. A class of parameter-efficient fine-tuning\n(PEFT) aims to mitigate these computational challenges by selectively\nfine-tuning only a small fraction of the model parameters. Although\ncomputationally efficient, these techniques often fail to match the performance\nof fully fine-tuned models, primarily due to inherent biases introduced during\nparameter selection. Traditional selective PEFT techniques use a fixed set of\nparameters based on a predefined budget (a process also known as unmasking),\nfailing to capture parameter importance dynamically and often ending up\nexceeding the budget. We introduce $\\text{ID}^3$, a novel selective PEFT method\nthat calculates parameter importance continually and dynamically unmasks\nparameters by balancing exploration and exploitation in parameter selection.\nOur empirical study on 15 tasks spanning natural language understanding and\ngenerative tasks demonstrates the effectiveness of our method compared to\nfixed-masking-based PEFT techniques. We analytically show that $\\text{ID}^3$\nreduces the number of gradient updates by a factor of two, enhancing\ncomputational efficiency. $\\text{ID}^3$ is robust to random initialization of\nneurons and, therefore, can be seamlessly integrated into existing additive and\nreparametrization-based PEFT modules such as adapters and LoRA for dynamic\nsparsification.\n","authors":["Aradhye Agarwal","Suhas K Ramesh","Ayan Sengupta","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2408.14470v2.pdf","comment":"15 pages, 7 tables, 9 figures"},{"id":"http://arxiv.org/abs/2402.10260v2","updated":"2024-08-27T03:32:47Z","published":"2024-02-15T18:58:09Z","title":"A StrongREJECT for Empty Jailbreaks","summary":" Most jailbreak papers claim the jailbreaks they propose are highly effective,\noften boasting near-100% attack success rates. However, it is perhaps more\ncommon than not for jailbreak developers to substantially exaggerate the\neffectiveness of their jailbreaks. We suggest this problem arises because\njailbreak researchers lack a standard, high-quality benchmark for evaluating\njailbreak performance, leaving researchers to create their own. To create a\nbenchmark, researchers must choose a dataset of forbidden prompts to which a\nvictim model will respond, along with an evaluation method that scores the\nharmfulness of the victim model's responses. We show that existing benchmarks\nsuffer from significant shortcomings and introduce the StrongREJECT benchmark\nto address these issues. 
StrongREJECT's dataset contains prompts that victim\nmodels must answer with specific, harmful information, while its automated\nevaluator measures the extent to which a response gives useful information to\nforbidden prompts. In doing so, the StrongREJECT evaluator achieves\nstate-of-the-art agreement with human judgments of jailbreak effectiveness.\nNotably, we find that existing evaluation methods significantly overstate\njailbreak effectiveness compared to human judgments and the StrongREJECT\nevaluator. We describe a surprising and novel phenomenon that explains this\ndiscrepancy: jailbreaks bypassing a victim model's safety fine-tuning tend to\nreduce its capabilities. Together, our findings underscore the need for\nresearchers to use a high-quality benchmark, such as StrongREJECT, when\ndeveloping new jailbreak attacks. We release the StrongREJECT code and data at\nhttps://strong-reject.readthedocs.io/en/latest/.\n","authors":["Alexandra Souly","Qingyuan Lu","Dillon Bowen","Tu Trinh","Elvis Hsieh","Sana Pandey","Pieter Abbeel","Justin Svegliato","Scott Emmons","Olivia Watkins","Sam Toyer"],"pdf_url":"https://arxiv.org/pdf/2402.10260v2.pdf","comment":"Code and data at https://strong-reject.readthedocs.io/en/latest/"},{"id":"http://arxiv.org/abs/2408.13184v2","updated":"2024-08-27T03:27:08Z","published":"2024-08-23T16:02:54Z","title":"Can LLM be a Good Path Planner based on Prompt Engineering? Mitigating\n the Hallucination for Path Planning","summary":" Spatial reasoning in Large Language Models (LLMs) is the foundation for\nembodied intelligence. However, even in simple maze environments, LLMs still\nencounter challenges in long-term path-planning, primarily influenced by their\nspatial hallucination and context inconsistency hallucination by long-term\nreasoning. To address this challenge, this study proposes an innovative model,\nSpatial-to-Relational Transformation and Curriculum Q-Learning (S2RCQL). To\naddress the spatial hallucination of LLMs, we propose the Spatial-to-Relational\napproach, which transforms spatial prompts into entity relations and paths\nrepresenting entity relation chains. This approach fully taps the potential of\nLLMs in terms of sequential thinking. As a result, we design a path-planning\nalgorithm based on Q-learning to mitigate the context inconsistency\nhallucination, which enhances the reasoning ability of LLMs. Using the Q-value\nof state-action as auxiliary information for prompts, we correct the\nhallucinations of LLMs, thereby guiding LLMs to learn the optimal path.\nFinally, we propose a reverse curriculum learning technique based on LLMs to\nfurther mitigate the context inconsistency hallucination. LLMs can rapidly\naccumulate successful experiences by reducing task difficulty and leveraging\nthem to tackle more complex tasks. We performed comprehensive experiments based\non Baidu's self-developed LLM: ERNIE-Bot 4.0. 
The results showed that our\nS2RCQL achieved a 23%--40% improvement in both success and optimality rates\ncompared with advanced prompt engineering.\n","authors":["Hourui Deng","Hongjie Zhang","Jie Ou","Chaosheng Feng"],"pdf_url":"https://arxiv.org/pdf/2408.13184v2.pdf","comment":"Submitted to ICASSP"},{"id":"http://arxiv.org/abs/2408.01262v3","updated":"2024-08-27T03:13:50Z","published":"2024-08-02T13:35:11Z","title":"RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework","summary":" Retrieval-Augmented Generation (RAG) systems have demonstrated their\nadvantages in alleviating the hallucination of Large Language Models (LLMs).\nExisting RAG benchmarks mainly focus on evaluating whether LLMs can correctly\nanswer the general knowledge. However, they are unable to evaluate the\neffectiveness of the RAG system in dealing with the data from different\nvertical domains. This paper introduces RAGEval, a framework for automatically\ngenerating evaluation datasets to evaluate the knowledge usage ability of\ndifferent LLMs in different scenarios. Specifically, RAGEval summarizes a\nschema from seed documents, applies the configurations to generate diverse\ndocuments, and constructs question-answering pairs according to both articles\nand configurations. We propose three novel metrics, Completeness,\nHallucination, and Irrelevance, to carefully evaluate the responses generated\nby LLMs. By benchmarking RAG models in vertical domains, RAGEval has the\nability to better evaluate the knowledge usage ability of LLMs, which avoids\nthe confusion regarding the source of knowledge in answering question in\nexisting QA datasets--whether it comes from parameterized memory or retrieval.\nThe code and dataset will be released.\n","authors":["Kunlun Zhu","Yifan Luo","Dingling Xu","Ruobing Wang","Shi Yu","Shuo Wang","Yukun Yan","Zhenghao Liu","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2408.01262v3.pdf","comment":"add github repo"},{"id":"http://arxiv.org/abs/2408.14750v1","updated":"2024-08-27T03:01:48Z","published":"2024-08-27T03:01:48Z","title":"LyCon: Lyrics Reconstruction from the Bag-of-Words Using Large Language\n Models","summary":" This paper addresses the unique challenge of conducting research in lyric\nstudies, where direct use of lyrics is often restricted due to copyright\nconcerns. Unlike typical data, internet-sourced lyrics are frequently protected\nunder copyright law, necessitating alternative approaches. Our study introduces\na novel method for generating copyright-free lyrics from publicly available\nBag-of-Words (BoW) datasets, which contain the vocabulary of lyrics but not the\nlyrics themselves. Utilizing metadata associated with BoW datasets and large\nlanguage models, we successfully reconstructed lyrics. We have compiled and\nmade available a dataset of reconstructed lyrics, LyCon, aligned with metadata\nfrom renowned sources including the Million Song Dataset, Deezer Mood Detection\nDataset, and AllMusic Genre Dataset, available for public access. 
We believe\nthat the integration of metadata such as mood annotations or genres enables a\nvariety of academic experiments on lyrics, such as conditional lyric\ngeneration.\n","authors":["Haven Kim","Kahyun Choi"],"pdf_url":"https://arxiv.org/pdf/2408.14750v1.pdf","comment":"Dataset downlodable at https://github.com/havenpersona/lycon"},{"id":"http://arxiv.org/abs/2408.10903v4","updated":"2024-08-27T02:58:39Z","published":"2024-08-20T14:47:38Z","title":"BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General\n Role-Playing Language Model","summary":" The rapid advancement of large language models (LLMs) has revolutionized\nrole-playing, enabling the development of general role-playing models. However,\ncurrent role-playing training has two significant issues: (I) Using a\npredefined role profile to prompt dialogue training for specific scenarios\nusually leads to inconsistencies and even conflicts between the dialogue and\nthe profile, resulting in training biases. (II) The model learns to imitate the\nrole based solely on the profile, neglecting profile-dialogue alignment at the\nsentence level. In this work, we propose a simple yet effective framework\ncalled BEYOND DIALOGUE, designed to overcome these hurdles. This framework\ninnovatively introduces \"beyond dialogue\" tasks to align dialogue with profile\ntraits based on each specific scenario, thereby eliminating biases during\ntraining. Furthermore, by adopting an innovative prompting mechanism that\ngenerates reasoning outcomes for training, the framework allows the model to\nachieve fine-grained alignment between profile and dialogue at the sentence\nlevel. The aforementioned methods are fully automated and low-cost.\nAdditionally, the integration of automated dialogue and objective evaluation\nmethods forms a comprehensive framework, paving the way for general\nrole-playing. Experimental results demonstrate that our model excels in\nadhering to and reflecting various dimensions of role profiles, outperforming\nmost proprietary general and specialized role-playing baselines. All code and\ndatasets are available at https://github.com/yuyouyu32/BeyondDialogue.\n","authors":["Yeyong Yu","Runsheng Yu","Haojie Wei","Zhanqiu Zhang","Quan Qian"],"pdf_url":"https://arxiv.org/pdf/2408.10903v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05706v2","updated":"2024-08-27T02:24:30Z","published":"2024-02-08T14:35:09Z","title":"Integrating Paralinguistics in Speech-Empowered Large Language Models\n for Natural Conversation","summary":" Recent work shows promising results in expanding the capabilities of large\nlanguage models (LLM) to directly understand and synthesize speech. However, an\nLLM-based strategy for modeling spoken dialogs remains elusive, calling for\nfurther investigation. This paper introduces an extensive speech-text LLM\nframework, the Unified Spoken Dialog Model (USDM), designed to generate\ncoherent spoken responses with naturally occurring prosodic features relevant\nto the given input speech without relying on explicit automatic speech\nrecognition (ASR) or text-to-speech (TTS) systems. We have verified the\ninclusion of prosody in speech tokens that predominantly contain semantic\ninformation and have used this foundation to construct a prosody-infused\nspeech-text model. Additionally, we propose a generalized speech-text\npretraining scheme that enhances the capture of cross-modal semantics. 
To\nconstruct USDM, we fine-tune our speech-text model on spoken dialog data using\na multi-step spoken dialog template that stimulates the chain-of-reasoning\ncapabilities exhibited by the underlying LLM. Automatic and human evaluations\non the DailyTalk dataset demonstrate that our approach effectively generates\nnatural-sounding spoken responses, surpassing previous and cascaded baselines.\nWe will make our code and checkpoints publicly available.\n","authors":["Heeseung Kim","Soonshin Seo","Kyeongseok Jeong","Ohsung Kwon","Soyoon Kim","Jungwhan Kim","Jaehong Lee","Eunwoo Song","Myungwoo Oh","Jung-Woo Ha","Sungroh Yoon","Kang Min Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.05706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12654v3","updated":"2024-08-27T02:22:00Z","published":"2024-02-20T02:04:38Z","title":"OWSM-CTC: An Open Encoder-Only Speech Foundation Model for Speech\n Recognition, Translation, and Language Identification","summary":" There has been an increasing interest in large speech models that can perform\nmultiple tasks in a single model. Such models usually adopt an encoder-decoder\nor decoder-only architecture due to their popularity and good performance in\nmany domains. However, autoregressive models can be slower during inference\ncompared to non-autoregressive models and also have potential risks of\nhallucination. Though prior studies observed promising results of\nnon-autoregressive models for certain tasks at small scales, it remains unclear\nif they can be scaled to speech-to-text generation in diverse languages and\ntasks. Inspired by the Open Whisper-style Speech Model (OWSM) project, we\npropose OWSM-CTC, a novel encoder-only speech foundation model based on\nConnectionist Temporal Classification (CTC). It is trained on 180k hours of\npublic audio data for multilingual automatic speech recognition (ASR), speech\ntranslation (ST), and language identification (LID). Compared to\nencoder-decoder OWSM, our OWSM-CTC achieves competitive results on ASR and up\nto 24% relative improvement on ST, while it is more robust and 3 to 4 times\nfaster for inference. OWSM-CTC also improves the long-form ASR result with 20x\nspeed-up. We will publicly release our code, pre-trained model, and training\nlogs to promote open science in speech foundation models.\n","authors":["Yifan Peng","Yui Sudo","Muhammad Shakeel","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2402.12654v3.pdf","comment":"Accepted at ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2401.16658v3","updated":"2024-08-27T02:15:49Z","published":"2024-01-30T01:22:18Z","title":"OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on\n E-Branchformer","summary":" Recent studies have highlighted the importance of fully open foundation\nmodels. The Open Whisper-style Speech Model (OWSM) is an initial step towards\nreproducing OpenAI Whisper using public data and open-source toolkits. However,\nprevious versions of OWSM (v1 to v3) are still based on standard Transformer,\nwhich might lead to inferior performance compared to state-of-the-art speech\nencoder architectures. This work aims to improve the performance and efficiency\nof OWSM without additional data. We present a series of E-Branchformer-based\nmodels named OWSM v3.1, ranging from 100M to 1B parameters. OWSM v3.1\noutperforms its predecessor, OWSM v3, in most evaluation benchmarks, while\nshowing an improved inference speed of up to 25%. 
We further reveal the\nemergent ability of OWSM v3.1 in zero-shot contextual biasing speech\nrecognition. We also provide a model trained on a subset of data with low\nlicense restrictions. We will publicly release the code, pre-trained models,\nand training logs.\n","authors":["Yifan Peng","Jinchuan Tian","William Chen","Siddhant Arora","Brian Yan","Yui Sudo","Muhammad Shakeel","Kwanghee Choi","Jiatong Shi","Xuankai Chang","Jee-weon Jung","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2401.16658v3.pdf","comment":"Accepted at INTERSPEECH 2024. Webpage:\n https://www.wavlab.org/activities/2024/owsm/"},{"id":"http://arxiv.org/abs/2404.02342v2","updated":"2024-08-27T02:12:57Z","published":"2024-04-02T22:31:38Z","title":"A Computational Analysis of Lyric Similarity Perception","summary":" In musical compositions that include vocals, lyrics significantly contribute\nto artistic expression. Consequently, previous studies have introduced the\nconcept of a recommendation system that suggests lyrics similar to a user's\nfavorites or personalized preferences, aiding in the discovery of lyrics among\nmillions of tracks. However, many of these systems do not fully consider human\nperceptions of lyric similarity, primarily due to limited research in this\narea. To bridge this gap, we conducted a comparative analysis of computational\nmethods for modeling lyric similarity with human perception. Results indicated\nthat computational models based on similarities between embeddings from\npre-trained BERT-based models, the audio from which the lyrics are derived, and\nphonetic components are indicative of perceptual lyric similarity. This finding\nunderscores the importance of semantic, stylistic, and phonetic similarities in\nhuman perception about lyric similarity. We anticipate that our findings will\nenhance the development of similarity-based lyric recommendation systems by\noffering pseudo-labels for neural network development and introducing objective\nevaluation metrics.\n","authors":["Haven Kim","Taketo Akama"],"pdf_url":"https://arxiv.org/pdf/2404.02342v2.pdf","comment":"In the process of a detailed revision"},{"id":"http://arxiv.org/abs/2408.11247v2","updated":"2024-08-27T02:11:32Z","published":"2024-08-20T23:54:26Z","title":"Unboxing Occupational Bias: Grounded Debiasing of LLMs with U.S. Labor\n Data","summary":" Large Language Models (LLMs) are prone to inheriting and amplifying societal\nbiases embedded within their training data, potentially reinforcing harmful\nstereotypes related to gender, occupation, and other sensitive categories. This\nissue becomes particularly problematic as biased LLMs can have far-reaching\nconsequences, leading to unfair practices and exacerbating social inequalities\nacross various domains, such as recruitment, online content moderation, or even\nthe criminal justice system. Although prior research has focused on detecting\nbias in LLMs using specialized datasets designed to highlight intrinsic biases,\nthere has been a notable lack of investigation into how these findings\ncorrelate with authoritative datasets, such as those from the U.S. National\nBureau of Labor Statistics (NBLS). To address this gap, we conduct empirical\nresearch that evaluates LLMs in a ``bias-out-of-the-box\" setting, analyzing how\nthe generated outputs compare with the distributions found in NBLS data.\nFurthermore, we propose a straightforward yet effective debiasing mechanism\nthat directly incorporates NBLS instances to mitigate bias within LLMs. 
Our\nstudy spans seven different LLMs, including instructable, base, and\nmixture-of-expert models, and reveals significant levels of bias that are often\noverlooked by existing bias detection techniques. Importantly, our debiasing\nmethod, which does not rely on external datasets, demonstrates a substantial\nreduction in bias scores, highlighting the efficacy of our approach in creating\nfairer and more reliable LLMs.\n","authors":["Atmika Gorti","Manas Gaur","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2408.11247v2.pdf","comment":"Accepted in AAAI Spring Symposium 2024"},{"id":"http://arxiv.org/abs/2312.06635v6","updated":"2024-08-27T01:27:29Z","published":"2023-12-11T18:51:59Z","title":"Gated Linear Attention Transformers with Hardware-Efficient Training","summary":" Transformers with linear attention allow for efficient parallel training but\ncan simultaneously be formulated as an RNN with 2D (matrix-valued) hidden\nstates, thus enjoying linear-time inference complexity. However, linear\nattention generally underperforms ordinary softmax attention. Moreover, current\nimplementations of linear attention lack I/O-awareness and are thus slower than\nhighly optimized implementations of softmax attention. This work describes a\nhardware-efficient algorithm for linear attention that trades off memory\nmovement against parallelizability. The resulting implementation, dubbed\nFLASHLINEARATTENTION, is faster than FLASHATTENTION-2 (Dao, 2023) as a\nstandalone layer even on short sequence lengths (e.g., 1K). We then generalize\nthis algorithm to a more expressive variant of linear attention with\ndata-dependent gates. When used as a replacement for the standard attention\nlayer in Transformers, the resulting gated linear attention (GLA) Transformer\nis found to perform competitively against the LLaMA-architecture Transformer\n(Touvron et al., 2023) as well as recent linear-time-inference baselines such as\nRetNet (Sun et al., 2023a) and Mamba (Gu & Dao, 2023) on moderate-scale\nlanguage modeling experiments. GLA Transformer is especially effective at\nlength generalization, enabling a model trained on 2K to generalize to\nsequences longer than 20K without significant perplexity degradations. For\ntraining speed, the GLA Transformer has higher throughput than a\nsimilarly-sized Mamba model.\n","authors":["Songlin Yang","Bailin Wang","Yikang Shen","Rameswar Panda","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2312.06635v6.pdf","comment":"minor update"},{"id":"http://arxiv.org/abs/2408.14721v1","updated":"2024-08-27T01:04:14Z","published":"2024-08-27T01:04:14Z","title":"PAT: Pruning-Aware Tuning for Large Language Models","summary":" Large language models (LLMs) excel in language tasks, especially with\nsupervised fine-tuning after pre-training. However, their substantial memory\nand computational requirements hinder practical applications. Structural\npruning, which reduces less significant weight dimensions, is one solution.\nYet, traditional post-hoc pruning often leads to significant performance loss,\nwith limited recovery from further fine-tuning due to reduced capacity. Since\nthe model fine-tuning refines the general and chaotic knowledge in pre-trained\nmodels, we aim to incorporate structural pruning with the fine-tuning, and\npropose the Pruning-Aware Tuning (PAT) paradigm to eliminate model redundancy\nwhile preserving the model performance to the maximum extent. 
Specifically, we\ninsert the innovative Hybrid Sparsification Modules (HSMs) between the\nAttention and FFN components to accordingly sparsify the upstream and\ndownstream linear modules. The HSM comprises a lightweight operator and a\nglobally shared trainable mask. The lightweight operator maintains a training\noverhead comparable to that of LoRA, while the trainable mask unifies the\nchannels to be sparsified, ensuring structural pruning. Additionally, we\npropose the Identity Loss which decouples the transformation and scaling\nproperties of the HSMs to enhance training robustness. Extensive experiments\ndemonstrate that PAT excels in both performance and efficiency. For example,\nour Llama2-7b model with a 25\\% pruning ratio achieves 1.33$\\times$ speedup\nwhile outperforming the LoRA-finetuned model by up to 1.26\\% in accuracy with a\nsimilar training cost. Code:\nhttps://github.com/kriskrisliu/PAT_Pruning-Aware-Tuning\n","authors":["Yijiang Liu","Huanrui Yang","Youxin Chen","Rongyu Zhang","Miao Wang","Yuan Du","Li Du"],"pdf_url":"https://arxiv.org/pdf/2408.14721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12023v4","updated":"2024-08-27T00:48:35Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and\n70B) demonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables\naggressive quantization to sub-3 bits with only minor performance degradations.\nWhen finetuned on a language modeling calibration dataset, LQ-LoRA can also be\nused for model compression; in this setting our 2.75-bit LLaMA-2-70B model\n(which has 2.85 bits on average when including the low-rank components and\nrequires 27GB of GPU memory) performs respectably compared to the 16-bit\nbaseline.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10960v2","updated":"2024-08-27T00:27:12Z","published":"2024-07-15T17:55:42Z","title":"Fast Matrix Multiplications for Lookup Table-Quantized LLMs","summary":" The deployment of large language models (LLMs) is often constrained by memory\nbandwidth, where the primary bottleneck is the cost of transferring model\nparameters from the GPU's global memory to its registers. When coupled with\ncustom kernels that fuse the dequantization and matmul operations, weight-only\nquantization can thus enable faster inference by reducing the amount of memory\nmovement. 
However, developing high-performance kernels for weight-quantized\nLLMs presents substantial challenges, especially when the weights are\ncompressed to non-evenly-divisible bit widths (e.g., 3 bits) with non-uniform,\nlookup table (LUT) quantization. This paper describes FLUTE, a flexible lookup\ntable engine for LUT-quantized LLMs, which uses offline restructuring of the\nquantized weight matrix to minimize bit manipulations associated with\nunpacking, and vectorization and duplication of the lookup table to mitigate\nshared memory bandwidth constraints. At batch sizes < 32 and quantization group\nsize of 128 (typical in LLM inference), the FLUTE kernel can be 2-4x faster\nthan existing GEMM kernels. As an application of FLUTE, we explore a simple\nextension to lookup table-based NormalFloat quantization and apply it to\nquantize LLaMA3 to various configurations, obtaining competitive quantization\nperformance against strong baselines while obtaining an end-to-end throughput\nincrease of 1.5 to 2 times.\n","authors":["Han Guo","William Brandon","Radostin Cholakov","Jonathan Ragan-Kelley","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2407.10960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08816v2","updated":"2024-08-27T22:51:57Z","published":"2024-04-12T21:16:53Z","title":"Measuring the Quality of Answers in Political Q&As with Large Language\n Models","summary":" This paper introduces a new approach for measuring the quality of answers in\npolitical question-and-answer sessions. We propose to measure answer quality\nbased on the degree to which it allows to infer the initial question\naccurately. This measure of answer quality reflects how well the answer engages\nwith and addresses the initial question. Drawing an analogy with semantic\nsearch, we demonstrate that this measurement approach can be implemented by\nfine-tuning a large language model on the corpus of observed questions and\nanswers without additional labeled data. We showcase our approach within the\ncontext of the Question Period in the Canadian House of Commons, providing\nvaluable insights into the correlates of answer quality. Our findings reveal\nsignificant variations in answer quality based on the party affiliation of the\nmembers of Parliament asking the question. Additionally, we find a meaningful\ncorrelation between answer quality and the topic raised in the question.\n","authors":["R. Michael Alvarez","Jacob Morrier"],"pdf_url":"https://arxiv.org/pdf/2404.08816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17678v2","updated":"2024-08-27T22:06:20Z","published":"2024-07-25T00:27:07Z","title":"Efficient LLM Training and Serving with Heterogeneous Context Sharding\n among Attention Heads","summary":" Existing LLM training and inference frameworks struggle in boosting\nefficiency with sparsity while maintaining the integrity of context and model\narchitecture. Inspired by the sharding concept in database and the fact that\nattention parallelizes over heads on accelerators, we propose Sparsely-Sharded\n(S2) Attention, an attention algorithm that allocates heterogeneous context\npartitions for different attention heads to divide and conquer. S2-Attention\nenforces each attention head to only attend to a partition of contexts\nfollowing a strided sparsity pattern, while the full context is preserved as\nthe union of all the shards. As attention heads are processed in separate\nthread blocks, the context reduction for each head can thus produce end-to-end\nspeed-up and memory reduction. 
At inference, LLMs trained with S2-Attention can\nthen take the KV cache reduction as free meals with guaranteed model quality\npreservation. In experiments, we show S2-Attention can provide as much as (1) 25.3X\nwall-clock attention speed-up over FlashAttention-2, resulting in 6X reduction\nin end-to-end training time and 10X inference latency, (2) on-par model\ntraining quality compared to default attention, (3) perfect needle retrieval\naccuracy over 32K context window. On top of the algorithm, we build DKernel, an\nLLM training and inference kernel library that allows users to customize\nsparsity patterns for their own models. We open-sourced DKernel and make it\ncompatible with Megatron, Pytorch, and vLLM.\n","authors":["Xihui Lin","Yunan Zhang","Suyu Ge","Barun Patra","Vishrav Chaudhary","Hao Peng","Xia Song"],"pdf_url":"https://arxiv.org/pdf/2407.17678v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2408.15417v1","updated":"2024-08-27T21:46:47Z","published":"2024-08-27T21:46:47Z","title":"Implicit Geometry of Next-token Prediction: From Language Sparsity\n Patterns to Model Representations","summary":" Next-token prediction (NTP) over large text corpora has become the go-to\nparadigm to train large language models. Yet, it remains unclear how NTP\ninfluences the mapping of linguistic patterns to geometric properties of the\nresulting model representations. We frame training of large language models as\nsoft-label classification over sparse probabilistic label vectors, coupled with\nan analytical approximation that allows unrestricted generation of context\nembeddings. This approach links NTP training to rank-constrained, nuclear-norm\nregularized optimization in the logit domain, offering a framework for\nanalyzing the geometry of word and context embeddings. In large embedding\nspaces, we find that NTP implicitly favors learning logits with a sparse plus\nlow-rank structure. While the sparse component captures the co-occurrence\nfrequency of context-word pairs, the orthogonal low-rank component, which\nbecomes dominant as training progresses, depends solely on the sparsity pattern\nof the co-occurrence matrix. Consequently, when projected onto an appropriate\nsubspace, representations of contexts that are followed by the same set of\nnext-tokens collapse, a phenomenon we term subspace-collapse. We validate our\nfindings on synthetic and small-scale real language datasets. Finally, we\noutline potential research directions aimed at deepening the understanding of\nNTP's influence on the learning of linguistic patterns and regularities.\n","authors":["Yize Zhao","Tina Behnia","Vala Vakilian","Christos Thrampoulidis"],"pdf_url":"https://arxiv.org/pdf/2408.15417v1.pdf","comment":"Accepted at COLM 2024"},{"id":"http://arxiv.org/abs/2310.07819v3","updated":"2024-08-27T21:37:57Z","published":"2023-10-11T19:00:40Z","title":"Faithfulness Measurable Masked Language Models","summary":" A common approach to explaining NLP models is to use importance measures that\nexpress which tokens are important for a prediction. Unfortunately, such\nexplanations are often wrong despite being persuasive. Therefore, it is\nessential to measure their faithfulness. One such metric is if tokens are truly\nimportant, then masking them should result in worse model performance. However,\ntoken masking introduces out-of-distribution issues, and existing solutions\nthat address this are computationally expensive and employ proxy models.\nFurthermore, other metrics are very limited in scope. 
This work proposes an\ninherently faithfulness measurable model that addresses these challenges. This\nis achieved using a novel fine-tuning method that incorporates masking, such\nthat masking tokens become in-distribution by design. This differs from\nexisting approaches, which are completely model-agnostic but are inapplicable\nin practice. We demonstrate the generality of our approach by applying it to 16\ndifferent datasets and validate it using statistical in-distribution tests. The\nfaithfulness is then measured with 9 different importance measures. Because\nmasking is in-distribution, importance measures that themselves use masking\nbecome consistently more faithful. Additionally, because the model makes\nfaithfulness cheap to measure, we can optimize explanations towards maximal\nfaithfulness; thus, our model becomes indirectly inherently explainable.\n","authors":["Andreas Madsen","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2310.07819v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15409v1","updated":"2024-08-27T21:19:37Z","published":"2024-08-27T21:19:37Z","title":"Awes, Laws, and Flaws From Today's LLM Research","summary":" We perform a critical examination of the scientific methodology behind\ncontemporary large language model (LLM) research. For this we assess over 2,000\nresearch works based on criteria typical of what is considered good research\n(e.g. presence of statistical tests and reproducibility) and cross-validate it\nwith arguments that are at the centre of controversy (e.g., claims of emergent\nbehaviour, the use of LLMs as evaluators). We find multiple trends, such as\ndeclines in claims of emergent behaviour and the presence of ethics\ndisclaimers; and the rise of LLMs as evaluators. This paper underscores the\nneed for more scrutiny and rigour by and from this field. Critical reading and\nfamiliarity with the literature are crucial to live up to the fundamentals of a\nresponsible scientific method that is ethical, reproducible, systematic, and\nopen to criticism.\n","authors":["Adrian de Wynter"],"pdf_url":"https://arxiv.org/pdf/2408.15409v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.15406v1","updated":"2024-08-27T21:03:42Z","published":"2024-08-27T21:03:42Z","title":"Intertwined Biases Across Social Media Spheres: Unpacking Correlations\n in Media Bias Dimensions","summary":" Media bias significantly shapes public perception by reinforcing stereotypes\nand exacerbating societal divisions. Prior research has often focused on\nisolated media bias dimensions such as \\textit{political bias} or\n\\textit{racial bias}, neglecting the complex interrelationships among various\nbias dimensions across different topic domains. Moreover, we observe that\nmodels trained on existing media bias benchmarks fail to generalize effectively\non recent social media posts, particularly in certain bias identification\ntasks. This shortfall primarily arises because these benchmarks do not\nadequately reflect the rapidly evolving nature of social media content, which\nis characterized by shifting user behaviors and emerging trends. In response to\nthese limitations, our research introduces a novel dataset collected from\nYouTube and Reddit over the past five years. Our dataset includes automated\nannotations for YouTube content across a broad spectrum of bias dimensions,\nsuch as gender, racial, and political biases, as well as hate speech, among\nothers. 
It spans diverse domains including politics, sports, healthcare,\neducation, and entertainment, reflecting the complex interplay of biases across\ndifferent societal sectors. Through comprehensive statistical analysis, we\nidentify significant differences in bias expression patterns and intra-domain\nbias correlations across these domains. By utilizing our understanding of the\ncorrelations among various bias dimensions, we lay the groundwork for creating\nadvanced systems capable of detecting multiple biases simultaneously. Overall,\nour dataset advances the field of media bias identification, contributing to\nthe development of tools that promote fairer media consumption. The\ncomprehensive awareness of existing media bias fosters more ethical journalism,\npromotes cultural sensitivity, and supports a more informed and equitable\npublic discourse.\n","authors":["Yifan Liu","Yike Li","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15406v1.pdf","comment":"Accepted to ASONAM 2024"},{"id":"http://arxiv.org/abs/2408.15399v1","updated":"2024-08-27T20:51:06Z","published":"2024-08-27T20:51:06Z","title":"A Statistical Framework for Data-dependent Retrieval-Augmented Models","summary":" Modern ML systems increasingly augment input instances with additional\nrelevant information to enhance final prediction. Despite growing interest in\nsuch retrieval-augmented models, their fundamental properties and training are\nnot well understood. We propose a statistical framework to study such models\nwith two components: 1) a {\\em retriever} to identify the relevant information\nout of a large corpus via a data-dependent metric; and 2) a {\\em predictor}\nthat consumes the input instances along with the retrieved information to make\nthe final predictions. We present a principled method for end-to-end training\nof both components and draw connections with various training approaches in the\nliterature. Furthermore, we establish excess risk bounds for\nretrieval-augmented models while delineating the contributions of both\nretriever and predictor towards the model performance. We validate the utility\nof our proposed training methods along with the key takeaways from our\nstatistical analysis on open domain question answering task where retrieval\naugmentation is important.\n","authors":["Soumya Basu","Ankit Singh Rawat","Manzil Zaheer"],"pdf_url":"https://arxiv.org/pdf/2408.15399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01966v2","updated":"2024-08-27T20:05:59Z","published":"2024-08-04T09:04:44Z","title":"ML-EAT: A Multilevel Embedding Association Test for Interpretable and\n Transparent Social Science","summary":" This research introduces the Multilevel Embedding Association Test (ML-EAT),\na method designed for interpretable and transparent measurement of intrinsic\nbias in language technologies. The ML-EAT addresses issues of ambiguity and\ndifficulty in interpreting the traditional EAT measurement by quantifying bias\nat three levels of increasing granularity: the differential association between\ntwo target concepts with two attribute concepts; the individual effect size of\neach target concept with two attribute concepts; and the association between\neach individual target concept and each individual attribute concept. Using the\nML-EAT, this research defines a taxonomy of EAT patterns describing the nine\npossible outcomes of an embedding association test, each of which is associated\nwith a unique EAT-Map, a novel four-quadrant visualization for interpreting the\nML-EAT. 
Empirical analysis of static and diachronic word embeddings, GPT-2\nlanguage models, and a CLIP language-and-image model shows that EAT patterns\nadd otherwise unobservable information about the component biases that make up\nan EAT; reveal the effects of prompting in zero-shot models; and can also\nidentify situations when cosine similarity is an ineffective metric, rendering\nan EAT unreliable. Our work contributes a method for rendering bias more\nobservable and interpretable, improving the transparency of computational\ninvestigations into human minds and societies.\n","authors":["Robert Wolfe","Alexis Hiniker","Bill Howe"],"pdf_url":"https://arxiv.org/pdf/2408.01966v2.pdf","comment":"Accepted at Artificial Intelligence, Ethics, and Society 2024"},{"id":"http://arxiv.org/abs/2408.01959v2","updated":"2024-08-27T19:57:45Z","published":"2024-08-04T08:26:58Z","title":"Dataset Scale and Societal Consistency Mediate Facial Impression Bias in\n Vision-Language AI","summary":" Multimodal AI models capable of associating images and text hold promise for\nnumerous domains, ranging from automated image captioning to accessibility\napplications for blind and low-vision users. However, uncertainty about bias\nhas in some cases limited their adoption and availability. In the present work,\nwe study 43 CLIP vision-language models to determine whether they learn\nhuman-like facial impression biases, and we find evidence that such biases are\nreflected across three distinct CLIP model families. We show for the first time\nthat the degree to which a bias is shared across a society predicts the\ndegree to which it is reflected in a CLIP model. Human-like impressions of\nvisually unobservable attributes, like trustworthiness and sexuality, emerge\nonly in models trained on the largest dataset, indicating that a better fit to\nuncurated cultural data results in the reproduction of increasingly subtle\nsocial biases. Moreover, we use a hierarchical clustering approach to show that\ndataset size predicts the extent to which the underlying structure of facial\nimpression bias resembles that of facial impression bias in humans. Finally, we\nshow that Stable Diffusion models employing CLIP as a text encoder learn facial\nimpression biases, and that these biases intersect with racial biases in Stable\nDiffusion XL-Turbo. While pretrained CLIP models may prove useful for\nscientific studies of bias, they will also require significant dataset curation\nwhen intended for use as general-purpose models in a zero-shot setting.\n","authors":["Robert Wolfe","Aayushi Dangol","Alexis Hiniker","Bill Howe"],"pdf_url":"https://arxiv.org/pdf/2408.01959v2.pdf","comment":"Accepted at Artificial Intelligence, Ethics, and Society 2024"},{"id":"http://arxiv.org/abs/2408.15379v1","updated":"2024-08-27T19:33:15Z","published":"2024-08-27T19:33:15Z","title":"DualKanbaFormer: Kolmogorov-Arnold Networks and State Space Model\n Transformer for Multimodal Aspect-based Sentiment Analysis","summary":" Multimodal aspect-based sentiment analysis (MABSA) enhances sentiment\ndetection by combining text with other data types like images. However, despite\nsetting significant benchmarks, attention mechanisms exhibit limitations in\nefficiently modelling long-range dependencies between aspect and opinion\ntargets within the text. They also face challenges in capturing global-context\ndependencies for visual representations.
To this end, we propose\nKolmogorov-Arnold Networks (KANs) and Selective State Space model (Mamba)\ntransformer (DualKanbaFormer), a novel architecture to address the above\nissues. We leverage the power of Mamba to capture global context dependencies,\nMulti-head Attention (MHA) to capture local context dependencies, and KANs to\ncapture non-linear modelling patterns for both textual representations (textual\nKanbaFormer) and visual representations (visual KanbaFormer). Furthermore, we\nfuse the textual KanbaFormer and visual KanbaFormer with a gated fusion layer to\ncapture the inter-modality dynamics. According to extensive experimental\nresults, our model outperforms some state-of-the-art (SOTA) studies on two\npublic datasets.\n","authors":["Adamu Lawan","Juhua Pu","Haruna Yunusa","Muhammad Lawan","Aliyu Umar","Adamu Sani Yahya"],"pdf_url":"https://arxiv.org/pdf/2408.15379v1.pdf","comment":"10 pages, 2 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2408.15366v1","updated":"2024-08-27T19:03:11Z","published":"2024-08-27T19:03:11Z","title":"Pitfalls and Outlooks in Using COMET","summary":" Since its introduction, the COMET metric has blazed a trail in the machine\ntranslation community, given its strong correlation with human judgements of\ntranslation quality. Its success stems from being a modified pre-trained\nmultilingual model finetuned for quality assessment. However, being a\nmachine learning model, it also gives rise to a new set of pitfalls that may not be\nwidely known. We investigate these unexpected behaviours from three aspects: 1)\ntechnical: obsolete software versions and compute precision; 2) data: empty\ncontent, language mismatch, and translationese at test time as well as\ndistribution and domain biases in training; 3) usage and reporting:\nmulti-reference support and model referencing in the literature. All of these\nproblems imply that COMET scores are not comparable between papers or even\ntechnical setups, and we put forward our perspective on fixing each issue.\nFurthermore, we release the SacreCOMET package that can generate a signature\nfor the software and model configuration as well as an appropriate citation.\nThe goal of this work is to help the community make more sound use of the COMET\nmetric.\n","authors":["Vilém Zouhar","Pinzhen Chen","Tsz Kin Lam","Nikita Moghe","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2408.15366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06062v3","updated":"2024-08-27T18:47:13Z","published":"2024-08-12T11:23:24Z","title":"On Tables with Numbers, with Numbers","summary":" This paper is a critical reflection on the epistemic culture of contemporary\ncomputational linguistics, framed in the context of its growing obsession with\ntables with numbers. We argue against tables with numbers on the basis of their\nepistemic irrelevance, their environmental impact, their role in enabling and\nexacerbating social inequalities, and their deep ties to commercial\napplications and profit-driven research.
We substantiate our arguments with\nempirical evidence drawn from a meta-analysis of computational linguistics\nresearch over the last decade.\n","authors":["Konstantinos Kogkalidis","Stergios Chatzikyriakidis"],"pdf_url":"https://arxiv.org/pdf/2408.06062v3.pdf","comment":"v3: Stergios' acknowledgements"},{"id":"http://arxiv.org/abs/2408.15339v1","updated":"2024-08-27T18:04:07Z","published":"2024-08-27T18:04:07Z","title":"UNA: Unifying Alignments of RLHF/PPO, DPO and KTO by a Generalized\n Implicit Reward Function","summary":" An LLM is pretrained on trillions of tokens, but the pretrained LLM may still\ngenerate undesired responses. To solve this problem, alignment techniques such\nas RLHF, DPO and KTO are proposed. However, these alignment techniques have\nlimitations. For example, RLHF requires training the reward model and policy\nseparately, which is complex, time-consuming, memory-intensive and unstable\nduring training. DPO proposes a mapping between an optimal policy and\na reward, greatly simplifying the training process of RLHF. However, it cannot\ntake full advantage of a reward model, and it is limited to pairwise preference\ndata.\n In this paper, we propose \\textbf{UN}ified \\textbf{A}lignment (UNA) which\nunifies RLHF/PPO, DPO and KTO. Firstly, we mathematically prove that given the\nclassical RLHF objective, the optimal policy is induced by a generalized\nimplicit reward function. With this novel mapping between a reward model and an\noptimal policy, UNA can 1. unify RLHF/PPO, DPO and KTO into a supervised\nlearning problem of minimizing the difference between an implicit reward and an\nexplicit reward; 2. outperform RLHF/PPO while simplifying, stabilizing, speeding up and\nreducing the memory burden of the RL fine-tuning process; 3. accommodate different\nfeedback types including pairwise, binary and scalar feedback. Downstream\nexperiments show UNA outperforms DPO, KTO and RLHF.\n","authors":["Zhichao Wang","Bin Bi","Can Huang","Shiva Kumar Pentyala","Zixu James Zhu","Sitaram Asur","Na Claire Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.15339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15313v1","updated":"2024-08-27T17:31:21Z","published":"2024-08-27T17:31:21Z","title":"Bi-Factorial Preference Optimization: Balancing Safety-Helpfulness in\n Language Models","summary":" Fine-tuning large language models (LLMs) on human preferences, typically\nthrough reinforcement learning from human feedback (RLHF), has proven\nsuccessful in enhancing their capabilities. However, ensuring the safety of\nLLMs during fine-tuning remains a critical concern, and mitigating the\npotential conflicts between safety and helpfulness is costly in RLHF. To address\nthis issue, we propose a supervised learning framework called Bi-Factorial\nPreference Optimization (BFPO), which re-parameterizes a joint RLHF objective\nof both safety and helpfulness into a single supervised learning objective. In\nthe supervised optimization, a labeling function is used to capture global\npreference rankings to balance both safety and helpfulness. To evaluate BFPO,\nwe develop a benchmark including comprehensive discriminative and generative\ntasks for helpfulness and harmlessness. The results indicate that our method\nsignificantly outperforms existing approaches in both safety and helpfulness.\nMoreover, BFPO eliminates the need for human prompting and annotation in LLM\nfine-tuning while achieving the same level of safety as methods that heavily\nrely on human labor, with less than 10% of the computational resources.
The\ntraining recipes and models will be released.\n","authors":["Wenxuan Zhang","Philip H. S. Torr","Mohamed Elhoseiny","Adel Bibi"],"pdf_url":"https://arxiv.org/pdf/2408.15313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15297v1","updated":"2024-08-27T11:31:12Z","published":"2024-08-27T11:31:12Z","title":"YOLO-Stutter: End-to-end Region-Wise Speech Dysfluency Detection","summary":" Dysfluent speech detection is the bottleneck for disordered speech analysis\nand spoken language learning. Current state-of-the-art models are governed by\nrule-based systems which lack efficiency and robustness, and are sensitive to\ntemplate design. In this paper, we propose YOLO-Stutter: the first end-to-end\nmethod that detects dysfluencies in a time-accurate manner. YOLO-Stutter takes\nimperfect speech-text alignment as input, followed by a spatial feature\naggregator, and a temporal dependency extractor to perform region-wise boundary\nand class predictions. We also introduce two dysfluency corpora, VCTK-Stutter\nand VCTK-TTS, that simulate natural spoken dysfluencies including repetition,\nblock, missing, replacement, and prolongation. Our end-to-end method achieves\nstate-of-the-art performance with a minimal number of trainable parameters\non both simulated data and real aphasia speech. Code and datasets are\nopen-sourced at https://github.com/rorizzz/YOLO-Stutter\n","authors":["Xuanru Zhou","Anshul Kashyap","Steve Li","Ayati Sharma","Brittany Morin","David Baquirin","Jet Vonk","Zoe Ezzes","Zachary Miller","Maria Luisa Gorno Tempini","Jiachen Lian","Gopala Krishna Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2408.15297v1.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2408.15293v1","updated":"2024-08-27T08:19:34Z","published":"2024-08-27T08:19:34Z","title":"Learning Granularity Representation for Temporal Knowledge Graph\n Completion","summary":" Temporal Knowledge Graphs (TKGs) incorporate temporal information to reflect\nthe dynamic structural knowledge and evolutionary patterns of real-world facts.\nNevertheless, TKGs are still limited in downstream applications due to the\nproblem of incompleteness. Consequently, TKG completion (also known as link\nprediction) has been widely studied, with recent research focusing on\nincorporating independent embeddings of time or combining them with entities\nand relations to form temporal representations. However, most existing methods\noverlook the impact of history from a multi-granularity aspect. The inherent\nsemantics of human-defined temporal granularities, such as ordinal dates,\nreveal general patterns to which facts typically adhere. To counter this\nlimitation, this paper proposes \\textbf{L}earning \\textbf{G}ranularity\n\\textbf{Re}presentation (termed $\\mathsf{LGRe}$) for TKG completion. It\ncomprises two main components: Granularity Representation Learning (GRL) and\nAdaptive Granularity Balancing (AGB). Specifically, GRL employs time-specific\nmulti-layer convolutional neural networks to capture interactions between\nentities and relations at different granularities. After that, AGB generates\nadaptive weights for these embeddings according to temporal semantics,\nresulting in expressive representations of predictions. Moreover, to reflect\nsimilar semantics of adjacent timestamps, a temporal loss function is\nintroduced.
Extensive experimental results on four event benchmarks demonstrate\nthe effectiveness of $\\mathsf{LGRe}$ in learning time-related representations.\nTo ensure reproducibility, our code is available at\nhttps://github.com/KcAcoZhang/LGRe.\n","authors":["Jinchuan Zhang","Tianqi Wan","Chong Mu","Guangxi Lu","Ling Tian"],"pdf_url":"https://arxiv.org/pdf/2408.15293v1.pdf","comment":"15 pages. Accepted at ICONIP 2024"},{"id":"http://arxiv.org/abs/2408.13609v2","updated":"2024-08-27T04:49:46Z","published":"2024-08-24T15:43:02Z","title":"GNN: Graph Neural Network and Large Language Model for Data Discovery","summary":" Our algorithm GNN: Graph Neural Network and Large Language Model for Data\nDiscovery inherit the benefits of \\cite{hoang2024plod} (PLOD: Predictive\nLearning Optimal Data Discovery), \\cite{Hoang2024BODBO} (BOD: Blindly Optimal\nData Discovery) in terms of overcoming the challenges of having to predefine\nutility function and the human input for attribute ranking, which helps prevent\nthe time-consuming loop process. In addition to these previous works, our\nalgorithm GNN leverages the advantages of graph neural networks and large\nlanguage models to understand text type values that cannot be understood by\nPLOD and MOD, thus making the task of predicting outcomes more reliable. GNN\ncould be seen as an extension of PLOD in terms of understanding the text type\nvalue and the user's preferences, not only numerical values but also text\nvalues, making the promise of data science and analytics purposes.\n","authors":["Thomas Hoang"],"pdf_url":"https://arxiv.org/pdf/2408.13609v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.15242v1","updated":"2024-08-27T17:59:55Z","published":"2024-08-27T17:59:55Z","title":"Drone-assisted Road Gaussian Splatting with Cross-view Uncertainty","summary":" Robust and realistic rendering for large-scale road scenes is essential in\nautonomous driving simulation. Recently, 3D Gaussian Splatting (3D-GS) has made\ngroundbreaking progress in neural rendering, but the general fidelity of\nlarge-scale road scene renderings is often limited by the input imagery, which\nusually has a narrow field of view and focuses mainly on the street-level local\narea. Intuitively, the data from the drone's perspective can provide a\ncomplementary viewpoint for the data from the ground vehicle's perspective,\nenhancing the completeness of scene reconstruction and rendering. However,\ntraining naively with aerial and ground images, which exhibit large view\ndisparity, poses a significant convergence challenge for 3D-GS, and does not\ndemonstrate remarkable improvements in performance on road views. In order to\nenhance the novel view synthesis of road views and to effectively use the\naerial information, we design an uncertainty-aware training method that allows\naerial images to assist in the synthesis of areas where ground images have poor\nlearning outcomes instead of weighting all pixels equally in 3D-GS training\nlike prior work did. We are the first to introduce the cross-view uncertainty\nto 3D-GS by matching the car-view ensemble-based rendering uncertainty to\naerial images, weighting the contribution of each pixel to the training\nprocess. 
Additionally, to systematically quantify evaluation metrics, we\nassemble a high-quality synthesized dataset comprising both aerial and ground\nimages for road scenes.\n","authors":["Saining Zhang","Baijun Ye","Xiaoxue Chen","Yuantao Chen","Zongzheng Zhang","Cheng Peng","Yongliang Shi","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.15242v1.pdf","comment":"BMVC2024 Project Page: https://sainingzhang.github.io/project/uc-gs/\n Code: https://github.com/SainingZhang/uc-gs/"},{"id":"http://arxiv.org/abs/2408.15241v1","updated":"2024-08-27T17:59:41Z","published":"2024-08-27T17:59:41Z","title":"GenRec: Unifying Video Generation and Recognition with Diffusion Models","summary":" Video diffusion models are able to generate high-quality videos by learning\nstrong spatial-temporal priors on large-scale datasets. In this paper, we aim\nto investigate whether such priors derived from a generative process are\nsuitable for video recognition, and eventually joint optimization of generation\nand recognition. Building upon Stable Video Diffusion, we introduce GenRec, the\nfirst unified framework trained with a random-frame conditioning process so as\nto learn generalized spatial-temporal representations. The resulting framework\ncan naturally support generation and recognition, and more importantly is\nrobust even when visual inputs contain limited information. Extensive\nexperiments demonstrate the efficacy of GenRec for both recognition and\ngeneration. In particular, GenRec achieves competitive recognition performance,\noffering 75.8% and 87.2% accuracy on SSV2 and K400, respectively. GenRec also\ndelivers the best class-conditioned image-to-video generation results,\nachieving 46.5 and 49.3 FVD scores on the SSV2 and EK-100 datasets. Furthermore,\nGenRec demonstrates extraordinary robustness in scenarios where only limited\nframes can be observed.\n","authors":["Zejia Weng","Xitong Yang","Zhen Xing","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.15241v1.pdf","comment":"17 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2408.15239v1","updated":"2024-08-27T17:57:14Z","published":"2024-08-27T17:57:14Z","title":"Generative Inbetweening: Adapting Image-to-Video Models for Keyframe\n Interpolation","summary":" We present a method for generating video sequences with coherent motion\nbetween a pair of input key frames. We adapt a pretrained large-scale\nimage-to-video diffusion model (originally trained to generate videos moving\nforward in time from a single input image) for key frame interpolation, i.e.,\nto produce a video in between two input frames. We accomplish this adaptation\nthrough a lightweight fine-tuning technique that produces a version of the\nmodel that instead predicts videos moving backwards in time from a single input\nimage. This model (along with the original forward-moving model) is\nsubsequently used in a dual-directional diffusion sampling process that\ncombines the overlapping model estimates starting from each of the two\nkeyframes. Our experiments show that our method outperforms both existing\ndiffusion-based methods and traditional frame interpolation techniques.\n","authors":["Xiaojuan Wang","Boyang Zhou","Brian Curless","Ira Kemelmacher-Shlizerman","Aleksander Holynski","Steven M.
Seitz"],"pdf_url":"https://arxiv.org/pdf/2408.15239v1.pdf","comment":"project page: https://svd-keyframe-interpolation.github.io/"},{"id":"http://arxiv.org/abs/2408.15235v1","updated":"2024-08-27T17:53:18Z","published":"2024-08-27T17:53:18Z","title":"Learning-based Multi-View Stereo: A Survey","summary":" 3D reconstruction aims to recover the dense 3D structure of a scene. It plays\nan essential role in various applications such as Augmented/Virtual Reality\n(AR/VR), autonomous driving and robotics. Leveraging multiple views of a scene\ncaptured from different viewpoints, Multi-View Stereo (MVS) algorithms\nsynthesize a comprehensive 3D representation, enabling precise reconstruction\nin complex environments. Due to its efficiency and effectiveness, MVS has\nbecome a pivotal method for image-based 3D reconstruction. Recently, with the\nsuccess of deep learning, many learning-based MVS methods have been proposed,\nachieving impressive performance against traditional methods. We categorize\nthese learning-based methods as: depth map-based, voxel-based, NeRF-based, 3D\nGaussian Splatting-based, and large feed-forward methods. Among these, we focus\nsignificantly on depth map-based methods, which are the main family of MVS due\nto their conciseness, flexibility and scalability. In this survey, we provide a\ncomprehensive review of the literature at the time of this writing. We\ninvestigate these learning-based methods, summarize their performances on\npopular benchmarks, and discuss promising future research directions in this\narea.\n","authors":["Fangjinhua Wang","Qingtian Zhu","Di Chang","Quankai Gao","Junlin Han","Tong Zhang","Richard Hartley","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2408.15235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15231v1","updated":"2024-08-27T17:48:29Z","published":"2024-08-27T17:48:29Z","title":"DCT-CryptoNets: Scaling Private Inference in the Frequency Domain","summary":" The convergence of fully homomorphic encryption (FHE) and machine learning\noffers unprecedented opportunities for private inference of sensitive data. FHE\nenables computation directly on encrypted data, safeguarding the entire machine\nlearning pipeline, including data and model confidentiality. However, existing\nFHE-based implementations for deep neural networks face significant challenges\nin computational cost, latency, and scalability, limiting their practical\ndeployment. This paper introduces DCT-CryptoNets, a novel approach that\nleverages frequency-domain learning to tackle these issues. Our method operates\ndirectly in the frequency domain, utilizing the discrete cosine transform (DCT)\ncommonly employed in JPEG compression. This approach is inherently compatible\nwith remote computing services, where images are usually transmitted and stored\nin compressed formats. DCT-CryptoNets reduces the computational burden of\nhomomorphic operations by focusing on perceptually relevant low-frequency\ncomponents. This is demonstrated by substantial latency reduction of up to\n5.3$\\times$ compared to prior work on image classification tasks, including a\nnovel demonstration of ImageNet inference within 2.5 hours, down from 12.5\nhours compared to prior work on equivalent compute resources. Moreover,\nDCT-CryptoNets improves the reliability of encrypted accuracy by reducing\nvariability (e.g., from $\\pm$2.5\\% to $\\pm$1.0\\% on ImageNet). 
This study\ndemonstrates a promising avenue for achieving efficient and practical\nprivacy-preserving deep learning on high resolution images seen in real-world\napplications.\n","authors":["Arjun Roy","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2408.15231v1.pdf","comment":"Under Review; 10 pages content, 3 pages appendix, 4 figures, 8\n tables; Code TBD"},{"id":"http://arxiv.org/abs/2408.15224v1","updated":"2024-08-27T17:39:33Z","published":"2024-08-27T17:39:33Z","title":"SAM & SAM 2 in 3D Slicer: SegmentWithSAM Extension for Annotating\n Medical Images","summary":" Creating annotations for 3D medical data is time-consuming and often requires\nhighly specialized expertise. Various tools have been implemented to aid this\nprocess. Segment Anything Model 2 (SAM 2) offers a general-purpose prompt-based\nsegmentation algorithm designed to annotate videos. In this paper, we adapt\nthis model to the annotation of 3D medical images and offer our implementation\nin the form of an extension to the popular annotation software: 3D Slicer. Our\nextension allows users to place point prompts on 2D slices to generate\nannotation masks and propagate these annotations across entire volumes in\neither single-directional or bi-directional manners. Our code is publicly\navailable on https://github.com/mazurowski-lab/SlicerSegmentWithSAM and can be\neasily installed directly from the Extension Manager of 3D Slicer as well.\n","authors":["Zafer Yildiz","Yuwen Chen","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.15224v1.pdf","comment":"Future work: support for box and mask inputs for the video predictor\n of SAM 2"},{"id":"http://arxiv.org/abs/2408.15218v1","updated":"2024-08-27T17:31:00Z","published":"2024-08-27T17:31:00Z","title":"Histo-Diffusion: A Diffusion Super-Resolution Method for Digital\n Pathology with Comprehensive Quality Assessment","summary":" Digital pathology has advanced significantly over the last decade, with Whole\nSlide Images (WSIs) encompassing vast amounts of data essential for accurate\ndisease diagnosis. High-resolution WSIs are essential for precise diagnosis but\ntechnical limitations in scanning equipment and variablity in slide preparation\ncan hinder obtaining these images. Super-resolution techniques can enhance\nlow-resolution images; while Generative Adversarial Networks (GANs) have been\neffective in natural image super-resolution tasks, they often struggle with\nhistopathology due to overfitting and mode collapse. Traditional evaluation\nmetrics fall short in assessing the complex characteristics of histopathology\nimages, necessitating robust histology-specific evaluation methods.\n We introduce Histo-Diffusion, a novel diffusion-based method specially\ndesigned for generating and evaluating super-resolution images in digital\npathology. It includes a restoration module for histopathology prior and a\ncontrollable diffusion module for generating high-quality images. We have\ncurated two histopathology datasets and proposed a comprehensive evaluation\nstrategy which incorporates both full-reference and no-reference metrics to\nthoroughly assess the quality of digital pathology images.\n Comparative analyses on multiple datasets with state-of-the-art methods\nreveal that Histo-Diffusion outperforms GANs. 
Our method offers a versatile\nsolution for histopathology image super-resolution, capable of handling\nmulti-resolution generation from varied input sizes, providing valuable support\nin diagnostic processes.\n","authors":["Xuan Xu","Saarthak Kapse","Prateek Prasanna"],"pdf_url":"https://arxiv.org/pdf/2408.15218v1.pdf","comment":"We have submitted our paper to Medical Image Analysis and are\n currently awaiting feedback"},{"id":"http://arxiv.org/abs/2408.15217v1","updated":"2024-08-27T17:30:49Z","published":"2024-08-27T17:30:49Z","title":"Fundus2Video: Cross-Modal Angiography Video Generation from Static\n Fundus Photography with Clinical Knowledge Guidance","summary":" Fundus Fluorescein Angiography (FFA) is a critical tool for assessing retinal\nvascular dynamics and aiding in the diagnosis of eye diseases. However, its\ninvasive nature and less accessibility compared to Color Fundus (CF) images\npose significant challenges. Current CF to FFA translation methods are limited\nto static generation. In this work, we pioneer dynamic FFA video generation\nfrom static CF images. We introduce an autoregressive GAN for smooth,\nmemory-saving frame-by-frame FFA synthesis. To enhance the focus on dynamic\nlesion changes in FFA regions, we design a knowledge mask based on clinical\nexperience. Leveraging this mask, our approach integrates innovative knowledge\nmask-guided techniques, including knowledge-boosted attention, knowledge-aware\ndiscriminators, and mask-enhanced patchNCE loss, aimed at refining generation\nin critical areas and addressing the pixel misalignment challenge. Our method\nachieves the best FVD of 1503.21 and PSNR of 11.81 compared to other common\nvideo generation approaches. Human assessment by an ophthalmologist confirms\nits high generation quality. Notably, our knowledge mask surpasses supervised\nlesion segmentation masks, offering a promising non-invasive alternative to\ntraditional FFA for research and clinical applications. The code is available\nat https://github.com/Michi-3000/Fundus2Video.\n","authors":["Weiyi Zhang","Siyu Huang","Jiancheng Yang","Ruoyu Chen","Zongyuan Ge","Yingfeng Zheng","Danli Shi","Mingguang He"],"pdf_url":"https://arxiv.org/pdf/2408.15217v1.pdf","comment":"The paper has been accepted by Medical Image Computing and Computer\n Assisted Intervention Society (MICCAI) 2024"},{"id":"http://arxiv.org/abs/2407.05921v2","updated":"2024-08-27T17:14:16Z","published":"2024-07-08T13:28:47Z","title":"TAPVid-3D: A Benchmark for Tracking Any Point in 3D","summary":" We introduce a new benchmark, TAPVid-3D, for evaluating the task of\nlong-range Tracking Any Point in 3D (TAP-3D). While point tracking in two\ndimensions (TAP) has many benchmarks measuring performance on real-world\nvideos, such as TAPVid-DAVIS, three-dimensional point tracking has none. To\nthis end, leveraging existing footage, we build a new benchmark for 3D point\ntracking featuring 4,000+ real-world videos, composed of three different data\nsources spanning a variety of object types, motion patterns, and indoor and\noutdoor environments. To measure performance on the TAP-3D task, we formulate a\ncollection of metrics that extend the Jaccard-based metric used in TAP to\nhandle the complexities of ambiguous depth scales across models, occlusions,\nand multi-track spatio-temporal smoothness. 
We manually verify a large sample\nof trajectories to ensure correct video annotations, and assess the current\nstate of the TAP-3D task by constructing competitive baselines using existing\ntracking models. We anticipate this benchmark will serve as a guidepost to\nimprove our ability to understand precise 3D motion and surface deformation\nfrom monocular video. Code for dataset download, generation, and model\nevaluation is available at https://tapvid3d.github.io\n","authors":["Skanda Koppula","Ignacio Rocco","Yi Yang","Joe Heyward","João Carreira","Andrew Zisserman","Gabriel Brostow","Carl Doersch"],"pdf_url":"https://arxiv.org/pdf/2407.05921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15205v1","updated":"2024-08-27T17:06:22Z","published":"2024-08-27T17:06:22Z","title":"Leveraging Hallucinations to Reduce Manual Prompt Dependency in\n Promptable Segmentation","summary":" Promptable segmentation typically requires instance-specific manual prompts\nto guide the segmentation of each desired object. To minimize such a need,\ntask-generic promptable segmentation has been introduced, which employs a\nsingle task-generic prompt to segment various images of different objects in\nthe same task. Current methods use Multimodal Large Language Models (MLLMs) to\nreason detailed instance-specific prompts from a task-generic prompt for\nimproving segmentation accuracy. The effectiveness of this segmentation heavily\ndepends on the precision of these derived prompts. However, MLLMs often suffer\nfrom hallucinations during reasoning, resulting in inaccurate prompting. While\nexisting methods focus on eliminating hallucinations to improve a model, we\nargue that MLLM hallucinations can reveal valuable contextual insights when\nleveraged correctly, as they represent pre-trained large-scale knowledge beyond\nindividual images. In this paper, we utilize hallucinations to mine\ntask-related information from images and verify its accuracy for enhancing\nthe precision of the generated prompts. Specifically, we introduce an iterative\nPrompt-Mask Cycle generation framework (ProMaC) with a prompt generator and a\nmask generator. The prompt generator uses multi-scale chain-of-thought\nprompting, initially exploring hallucinations for extracting extended\ncontextual knowledge on a test image. These hallucinations are then reduced to\nformulate precise instance-specific prompts, directing the mask generator to\nproduce masks that are consistent with task semantics by mask semantic\nalignment. The generated masks iteratively induce the prompt generator to focus\nmore on task-relevant image areas and reduce irrelevant hallucinations,\nresulting jointly in better prompts and masks. Experiments on 5 benchmarks\ndemonstrate the effectiveness of ProMaC.
Code given in\nhttps://lwpyh.github.io/ProMaC/.\n","authors":["Jian Hu","Jiayi Lin","Junchi Yan","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2408.15205v1.pdf","comment":"We propose using hallucinations as prior knowledge to extract and\n validate task-related information, which helps generate instance-specific\n prompts for reducing reliance on manual prompts in promptable segmentation"},{"id":"http://arxiv.org/abs/2408.15201v1","updated":"2024-08-27T17:02:03Z","published":"2024-08-27T17:02:03Z","title":"An Investigation on The Position Encoding in Vision-Based Dynamics\n Prediction","summary":" Despite the success of vision-based dynamics prediction models, which predict\nobject states by utilizing RGB images and simple object descriptions, they were\nchallenged by environment misalignments. Although the literature has\ndemonstrated that unifying visual domains with both environment context and\nobject abstract, such as semantic segmentation and bounding boxes, can\neffectively mitigate the visual domain misalignment challenge, discussions were\nfocused on the abstract of environment context, and the insight of using\nbounding box as the object abstract is under-explored. Furthermore, we notice\nthat, as empirical results shown in the literature, even when the visual\nappearance of objects is removed, object bounding boxes alone, instead of being\ndirectly fed into the network, can indirectly provide sufficient position\ninformation via the Region of Interest Pooling operation for dynamics\nprediction. However, previous literature overlooked discussions regarding how\nsuch position information is implicitly encoded in the dynamics prediction\nmodel. Thus, in this paper, we provide detailed studies to investigate the\nprocess and necessary conditions for encoding position information via using\nthe bounding box as the object abstract into output features. Furthermore, we\nstudy the limitation of solely using object abstracts, such that the dynamics\nprediction performance will be jeopardized when the environment context varies.\n","authors":["Jiageng Zhu","Hanchen Xie","Jiazhi Li","Mahyar Khayatkhoei","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2408.15201v1.pdf","comment":"13 pages, 4 tables, and 3 figures. Accepted to ECCV2024 eXCV workshop"},{"id":"http://arxiv.org/abs/2408.02088v3","updated":"2024-08-27T16:46:53Z","published":"2024-08-04T16:54:49Z","title":"KAN-RCBEVDepth: A multi-modal fusion algorithm in object detection for\n autonomous driving","summary":" Accurate 3D object detection in autonomous driving is critical yet\nchallenging due to occlusions, varying object sizes, and complex urban\nenvironments. This paper introduces the KAN-RCBEVDepth method, an innovative\napproach aimed at enhancing 3D object detection by fusing multimodal sensor\ndata from cameras, LiDAR, and millimeter-wave radar. Our unique Bird's Eye\nView-based approach significantly improves detection accuracy and efficiency by\nseamlessly integrating diverse sensor inputs, refining spatial relationship\nunderstanding, and optimizing computational procedures. Experimental results\nshow that the proposed method outperforms existing techniques across multiple\ndetection metrics, achieving a higher Mean Distance AP (0.389, 23\\%\nimprovement), a better ND Score (0.485, 17.1\\% improvement), and a faster\nEvaluation Time (71.28s, 8\\% faster). 
Additionally, the KAN-RCBEVDepth method\nsignificantly reduces errors compared to BEVDepth, with lower Transformation\nError (0.6044, 13.8\\% improvement), Scale Error (0.2780, 2.6\\% improvement),\nOrientation Error (0.5830, 7.6\\% improvement), Velocity Error (0.4244, 28.3\\%\nimprovement), and Attribute Error (0.2129, 3.2\\% improvement). These findings\nsuggest that our method offers enhanced accuracy, reliability, and efficiency,\nmaking it well-suited for dynamic and demanding autonomous driving scenarios.\nThe code will be released in \\url{https://github.com/laitiamo/RCBEVDepth-KAN}.\n","authors":["Zhihao Lai","Chuanhao Liu","Shihui Sheng","Zhiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.02088v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08789v4","updated":"2024-08-27T16:43:17Z","published":"2023-07-17T19:17:10Z","title":"Creating Image Datasets in Agricultural Environments using DALL.E:\n Generative AI-Powered Large Language Model","summary":" This research investigated the role of artificial intelligence (AI),\nspecifically the DALL.E model by OpenAI, in advancing data generation and\nvisualization techniques in agriculture. DALL.E, an advanced AI image\ngenerator, works alongside ChatGPT's language processing to transform text\ndescriptions and image clues into realistic visual representations of the\ncontent. The study used both approaches of image generation: text-to-image and\nimage-to image (variation). Six types of datasets depicting fruit crop\nenvironment were generated. These AI-generated images were then compared\nagainst ground truth images captured by sensors in real agricultural fields.\nThe comparison was based on Peak Signal-to-Noise Ratio (PSNR) and Feature\nSimilarity Index (FSIM) metrics. The image-to-image generation exhibited a\n5.78% increase in average PSNR over text-to-image methods, signifying superior\nimage clarity and quality. However, this method also resulted in a 10.23%\ndecrease in average FSIM, indicating a diminished structural and textural\nsimilarity to the original images. Similar to these measures, human evaluation\nalso showed that images generated using image-to-image-based method were more\nrealistic compared to those generated with text-to-image approach. The results\nhighlighted DALL.E's potential in generating realistic agricultural image\ndatasets and thus accelerating the development and adoption of imaging-based\nprecision agricultural solutions.\n","authors":["Ranjan Sapkota","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2307.08789v4.pdf","comment":"9 Figures, 1 table, 17 pages"},{"id":"http://arxiv.org/abs/2408.15185v1","updated":"2024-08-27T16:40:14Z","published":"2024-08-27T16:40:14Z","title":"PoseWatch: A Transformer-based Architecture for Human-centric Video\n Anomaly Detection Using Spatio-temporal Pose Tokenization","summary":" Video Anomaly Detection (VAD) presents a significant challenge in computer\nvision, particularly due to the unpredictable and infrequent nature of\nanomalous events, coupled with the diverse and dynamic environments in which\nthey occur. Human-centric VAD, a specialized area within this domain, faces\nadditional complexities, including variations in human behavior, potential\nbiases in data, and substantial privacy concerns related to human subjects.\nThese issues complicate the development of models that are both robust and\ngeneralizable. 
To address these challenges, recent advancements have focused on\npose-based VAD, which leverages human pose as a high-level feature to mitigate\nprivacy concerns, reduce appearance biases, and minimize background\ninterference. In this paper, we introduce PoseWatch, a novel transformer-based\narchitecture designed specifically for human-centric pose-based VAD. PoseWatch\nfeatures an innovative Spatio-Temporal Pose and Relative Pose (ST-PRP)\ntokenization method that enhances the representation of human motion over time,\nwhich is also beneficial for broader human behavior analysis tasks. The\narchitecture's core, a Unified Encoder Twin Decoders (UETD) transformer,\nsignificantly improves the detection of anomalous behaviors in video data.\nExtensive evaluations across multiple benchmark datasets demonstrate that\nPoseWatch consistently outperforms existing methods, establishing a new\nstate-of-the-art in pose-based VAD. This work not only demonstrates the\nefficacy of PoseWatch but also highlights the potential of integrating Natural\nLanguage Processing techniques with computer vision to advance human behavior\nanalysis.\n","authors":["Ghazal Alinezhad Noghre","Armin Danesh Pazho","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2408.15185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12040v3","updated":"2024-08-27T16:25:47Z","published":"2024-07-01T17:59:55Z","title":"Comprehensive Performance Evaluation of YOLOv10, YOLOv9 and YOLOv8 on\n Detecting and Counting Fruitlet in Complex Orchard Environments","summary":" This study performed an extensive evaluation of the performances of all\nconfigurations of YOLOv8, YOLOv9, and YOLOv10 object detection algorithms for\nfruitlet (of green fruit) detection in commercial orchards. Additionally, this\nresearch performed and validated in-field counting of fruitlets using an iPhone\nand machine vision sensors in 5 different apple varieties (Scifresh, Scilate,\nHoneycrisp, Cosmic crisp & Golden delicious). This comprehensive investigation\nof total 17 different configurations (5 for YOLOv8, 6 for YOLOv9 and 6 for\nYOLOv10) revealed that YOLOv9 outperforms YOLOv10 and YOLOv8 in terms of\nmAP@50, while YOLOv10x outperformed all 17 configurations tested in terms of\nprecision and recall. Specifically, YOLOv9 Gelan-e achieved the highest mAP@50\nof 0.935, outperforming YOLOv10n's 0.921 and YOLOv8s's 0.924. In terms of\nprecision, YOLOv10x achieved the highest precision of 0.908, indicating\nsuperior object identification accuracy compared to other configurations tested\n(e.g. YOLOv9 Gelan-c with a precision of 0.903 and YOLOv8m with 0.897. In terms\nof recall, YOLOv10s achieved the highest in its series (0.872), while YOLOv9\nGelan m performed the best among YOLOv9 configurations (0.899), and YOLOv8n\nperformed the best among the YOLOv8 configurations (0.883). Meanwhile, three\nconfigurations of YOLOv10: YOLOv10b, YOLOv10l, and YOLOv10x achieved superior\npost-processing speeds of 1.5 milliseconds, outperforming all other\nconfigurations within the YOLOv9 and YOLOv8 families. Specifically, YOLOv9\nGelan-e recorded a post-processing speed of 1.9 milliseconds, and YOLOv8m\nachieved 2.1 milliseconds. 
Furthermore, YOLOv8n exhibited the highest inference\nspeed among all configurations tested, achieving a processing time of 4.1\nmilliseconds while YOLOv9 Gelan-t and YOLOv10n also demonstrated comparatively\nslower inference speeds of 9.3 ms and 5.5 ms, respectively.\n","authors":["Ranjan Sapkota","Zhichao Meng","Martin Churuvija","Xiaoqiang Du","Zenghong Ma","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2407.12040v3.pdf","comment":"14 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.15178v1","updated":"2024-08-27T16:22:18Z","published":"2024-08-27T16:22:18Z","title":"A Review of Transformer-Based Models for Computer Vision Tasks:\n Capturing Global Context and Spatial Relationships","summary":" Transformer-based models have transformed the landscape of natural language\nprocessing (NLP) and are increasingly applied to computer vision tasks with\nremarkable success. These models, renowned for their ability to capture\nlong-range dependencies and contextual information, offer a promising\nalternative to traditional convolutional neural networks (CNNs) in computer\nvision. In this review paper, we provide an extensive overview of various\ntransformer architectures adapted for computer vision tasks. We delve into how\nthese models capture global context and spatial relationships in images,\nempowering them to excel in tasks such as image classification, object\ndetection, and segmentation. Analyzing the key components, training\nmethodologies, and performance metrics of transformer-based models, we\nhighlight their strengths, limitations, and recent advancements. Additionally,\nwe discuss potential research directions and applications of transformer-based\nmodels in computer vision, offering insights into their implications for future\nadvancements in the field.\n","authors":["Gracile Astlin Pereira","Muhammad Hussain"],"pdf_url":"https://arxiv.org/pdf/2408.15178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10636v2","updated":"2024-08-27T16:19:22Z","published":"2024-08-20T08:22:29Z","title":"UWF-RI2FA: Generating Multi-frame Ultrawide-field Fluorescein\n Angiography from Ultrawide-field Retinal Imaging Improves Diabetic\n Retinopathy Stratification","summary":" Ultrawide-field fluorescein angiography (UWF-FA) facilitates diabetic\nretinopathy (DR) detection by providing a clear visualization of peripheral\nretinal lesions. However, the intravenous dye injection with potential risks\nhamper its application. We aim to acquire dye-free UWF-FA images from\nnoninvasive UWF retinal imaging (UWF-RI) using generative artificial\nintelligence (GenAI) and evaluate its effectiveness in DR screening. A total of\n18,321 UWF-FA images of different phases were registered with corresponding\nUWF-RI images and fed into a generative adversarial networks (GAN)-based model\nfor training. The quality of generated UWF-FA images was evaluated through\nquantitative metrics and human evaluation. The DeepDRiD dataset was used to\nexternally assess the contribution of generated UWF-FA images to DR\nclassification, using area under the receiver operating characteristic curve\n(AUROC) as outcome metrics. The generated early, mid, and late phase UWF-FA\nimages achieved high authenticity, with multi-scale similarity scores ranging\nfrom 0.70 to 0.91 and qualitative visual scores ranging from 1.64 to 1.98\n(1=real UWF-FA quality). In fifty randomly selected images, 56% to 76% of the\ngenerated images were difficult to distinguish from real images in the Turing\ntest. 
Moreover, adding these generated UWF-FA images for DR classification\nsignificantly increased the AUROC from 0.869 to 0.904 compared to the baseline\nmodel using UWF-RI images (P < .001). The model successfully generates\nrealistic multi-frame UWF-FA images for enhancing DR stratification without\nintravenous dye injection.\n","authors":["Ruoyu Chen","Kezheng Xu","Kangyan Zheng","Weiyi Zhang","Yan Lu","Danli Shi","Mingguang He"],"pdf_url":"https://arxiv.org/pdf/2408.10636v2.pdf","comment":"22 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.13698v2","updated":"2024-08-27T16:11:44Z","published":"2024-08-25T01:27:35Z","title":"CNN-Transformer Rectified Collaborative Learning for Medical Image\n Segmentation","summary":" Automatic and precise medical image segmentation (MIS) is of vital importance\nfor clinical diagnosis and analysis. Current MIS methods mainly rely on the\nconvolutional neural network (CNN) or self-attention mechanism (Transformer)\nfor feature modeling. However, CNN-based methods suffer from the inaccurate\nlocalization owing to the limited global dependency while Transformer-based\nmethods always present the coarse boundary for the lack of local emphasis.\nAlthough some CNN-Transformer hybrid methods are designed to synthesize the\ncomplementary local and global information for better performance, the\ncombination of CNN and Transformer introduces numerous parameters and increases\nthe computation cost. To this end, this paper proposes a CNN-Transformer\nrectified collaborative learning (CTRCL) framework to learn stronger CNN-based\nand Transformer-based models for MIS tasks via the bi-directional knowledge\ntransfer between them. Specifically, we propose a rectified logit-wise\ncollaborative learning (RLCL) strategy which introduces the ground truth to\nadaptively select and rectify the wrong regions in student soft labels for\naccurate knowledge transfer in the logit space. We also propose a class-aware\nfeature-wise collaborative learning (CFCL) strategy to achieve effective\nknowledge transfer between CNN-based and Transformer-based models in the\nfeature space by granting their intermediate features the similar capability of\ncategory perception. Extensive experiments on three popular MIS benchmarks\ndemonstrate that our CTRCL outperforms most state-of-the-art collaborative\nlearning methods under different evaluation metrics.\n","authors":["Lanhu Wu","Miao Zhang","Yongri Piao","Zhenyan Yao","Weibing Sun","Feng Tian","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2408.13698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15172v1","updated":"2024-08-27T16:10:21Z","published":"2024-08-27T16:10:21Z","title":"X-Reflect: Cross-Reflection Prompting for Multimodal Recommendation","summary":" Large Language Models (LLMs) and Large Multimodal Models (LMMs) have been\nshown to enhance the effectiveness of enriching item descriptions, thereby\nimproving the accuracy of recommendation systems. However, most existing\napproaches either rely on text-only prompting or employ basic multimodal\nstrategies that do not fully exploit the complementary information available\nfrom both textual and visual modalities. This paper introduces a novel\nframework, Cross-Reflection Prompting, termed X-Reflect, designed to address\nthese limitations by prompting LMMs to explicitly identify and reconcile\nsupportive and conflicting information between text and images. 
By capturing\nnuanced insights from both modalities, this approach generates more\ncomprehensive and contextually richer item representations. Extensive\nexperiments conducted on two widely used benchmarks demonstrate that our method\noutperforms existing prompting baselines in downstream recommendation accuracy.\nAdditionally, we evaluate the generalizability of our framework across\ndifferent LMM backbones and the robustness of the prompting strategies,\noffering insights for optimization. This work underscores the importance of\nintegrating multimodal information and presents a novel solution for improving\nitem understanding in multimodal recommendation systems.\n","authors":["Hanjia Lyu","Ryan Rossi","Xiang Chen","Md Mehrab Tanjim","Stefano Petrangeli","Somdeb Sarkhel","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13993v3","updated":"2024-08-27T15:56:33Z","published":"2024-04-22T08:59:35Z","title":"Zero-Shot Character Identification and Speaker Prediction in Comics via\n Iterative Multimodal Fusion","summary":" Recognizing characters and predicting speakers of dialogue are critical for\ncomic processing tasks, such as voice generation or translation. However,\nbecause characters vary by comic title, supervised learning approaches like\ntraining character classifiers which require specific annotations for each\ncomic title are infeasible. This motivates us to propose a novel zero-shot\napproach, allowing machines to identify characters and predict speaker names\nbased solely on unannotated comic images. In spite of their importance in\nreal-world applications, these task have largely remained unexplored due to\nchallenges in story comprehension and multimodal integration. Recent large\nlanguage models (LLMs) have shown great capability for text understanding and\nreasoning, while their application to multimodal content analysis is still an\nopen problem. To address this problem, we propose an iterative multimodal\nframework, the first to employ multimodal information for both character\nidentification and speaker prediction tasks. Our experiments demonstrate the\neffectiveness of the proposed framework, establishing a robust baseline for\nthese tasks. Furthermore, since our method requires no training data or\nannotations, it can be used as-is on any comic series.\n","authors":["Yingxuan Li","Ryota Hinami","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13993v3.pdf","comment":"Accepted to ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.15159v1","updated":"2024-08-27T15:55:18Z","published":"2024-08-27T15:55:18Z","title":"Empowering Sign Language Communication: Integrating Sentiment and\n Semantics for Facial Expression Synthesis","summary":" Translating written sentences from oral languages to a sequence of manual and\nnon-manual gestures plays a crucial role in building a more inclusive society\nfor deaf and hard-of-hearing people. Facial expressions (non-manual), in\nparticular, are responsible for encoding the grammar of the sentence to be\nspoken, applying punctuation, pronouns, or emphasizing signs. These non-manual\ngestures are closely related to the semantics of the sentence being spoken and\nalso to the utterance of the speaker's emotions. However, most Sign Language\nProduction (SLP) approaches are centered on synthesizing manual gestures and do\nnot focus on modeling the speakers expression. 
This paper introduces a new\nmethod focused on synthesizing facial expressions for sign language. Our goal\nis to improve sign language production by integrating sentiment information in\nfacial expression generation. The approach leverages sentence sentiment and\nsemantic features to sample from a meaningful representation space, integrating\nthe bias of the non-manual components into the sign language production\nprocess. To evaluate our method, we extend the Frechet Gesture Distance (FGD)\nand propose a new metric called Frechet Expression Distance (FED) and apply an\nextensive set of metrics to assess the quality of specific regions of the face.\nThe experimental results showed that our method achieved state-of-the-art\nperformance, surpassing the competitors on the How2Sign and PHOENIX14T datasets.\nMoreover, our architecture is based on a carefully designed graph pyramid that\nmakes it simpler, easier to train, and capable of leveraging emotions to\nproduce facial expressions.\n","authors":["Rafael Azevedo","Thiago Coutinho","João Ferreira","Thiago Gomes","Erickson Nascimento"],"pdf_url":"https://arxiv.org/pdf/2408.15159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15143v1","updated":"2024-08-27T15:31:45Z","published":"2024-08-27T15:31:45Z","title":"A Preliminary Exploration Towards General Image Restoration","summary":" Despite the tremendous success of deep models in various individual image\nrestoration tasks, there are at least two major technical challenges preventing\nthese works from being applied to real-world usages: (1) the lack of\ngeneralization ability and (2) the complex and unknown degradations in\nreal-world scenarios. Existing deep models, tailored for specific individual\nimage restoration tasks, often fall short in effectively addressing these\nchallenges. In this paper, we present a new problem called general image\nrestoration (GIR) which aims to address these challenges within a unified\nmodel. GIR covers most individual image restoration tasks (\\eg, image\ndenoising, deblurring, deraining and super-resolution) and their combinations\nfor general purposes. This paper proceeds to delineate the essential aspects of\nGIR, including problem definition and the overarching significance of\ngeneralization performance. Moreover, the establishment of new datasets and a\nthorough evaluation framework for GIR models is discussed. We conduct a\ncomprehensive evaluation of existing approaches for tackling the GIR challenge,\nilluminating their strengths and pragmatic challenges. By analyzing these\napproaches, we not only underscore the effectiveness of GIR but also highlight\nthe difficulties in its practical implementation. Finally, we also try to\nunderstand and interpret these models' behaviors to inspire the future\ndirection. 
Our work can open up new valuable research directions and contribute\nto the research of general vision.\n","authors":["Xiangtao Kong","Jinjin Gu","Yihao Liu","Wenlong Zhang","Xiangyu Chen","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2408.15143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13896v2","updated":"2024-08-27T15:13:01Z","published":"2024-08-25T17:33:40Z","title":"RT-Attack: Jailbreaking Text-to-Image Models via Random Token","summary":" Recently, Text-to-Image(T2I) models have achieved remarkable success in image\ngeneration and editing, yet these models still have many potential issues,\nparticularly in generating inappropriate or Not-Safe-For-Work(NSFW) content.\nStrengthening attacks and uncovering such vulnerabilities can advance the\ndevelopment of reliable and practical T2I models. Most of the previous works\ntreat T2I models as white-box systems, using gradient optimization to generate\nadversarial prompts. However, accessing the model's gradient is often\nimpossible in real-world scenarios. Moreover, existing defense methods, those\nusing gradient masking, are designed to prevent attackers from obtaining\naccurate gradient information. While some black-box jailbreak attacks have been\nexplored, these typically rely on simply replacing sensitive words, leading to\nsuboptimal attack performance. To address this issue, we introduce a two-stage\nquery-based black-box attack method utilizing random search. In the first\nstage, we establish a preliminary prompt by maximizing the semantic similarity\nbetween the adversarial and target harmful prompts. In the second stage, we use\nthis initial prompt to refine our approach, creating a detailed adversarial\nprompt aimed at jailbreaking and maximizing the similarity in image features\nbetween the images generated from this prompt and those produced by the target\nharmful prompt. Extensive experiments validate the effectiveness of our method\nin attacking the latest prompt checkers, post-hoc image checkers, securely\ntrained T2I models, and online commercial models.\n","authors":["Sensen Gao","Xiaojun Jia","Yihao Huang","Ranjie Duan","Jindong Gu","Yang Liu","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2408.13896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15127v1","updated":"2024-08-27T15:07:58Z","published":"2024-08-27T15:07:58Z","title":"T-FAKE: Synthesizing Thermal Images for Facial Landmarking","summary":" Facial analysis is a key component in a wide range of applications such as\nsecurity, autonomous driving, entertainment, and healthcare. Despite the\navailability of various facial RGB datasets, the thermal modality, which plays\na crucial role in life sciences, medicine, and biometrics, has been largely\noverlooked. To address this gap, we introduce the T-FAKE dataset, a new\nlarge-scale synthetic thermal dataset with sparse and dense landmarks. To\nfacilitate the creation of the dataset, we propose a novel RGB2Thermal loss\nfunction, which enables the transfer of thermal style to RGB faces. By\nutilizing the Wasserstein distance between thermal and RGB patches and the\nstatistical analysis of clinical temperature distributions on faces, we ensure\nthat the generated thermal images closely resemble real samples. 
Using\nRGB2Thermal style transfer based on our RGB2Thermal loss function, we create\nthe T-FAKE dataset, a large-scale synthetic thermal dataset of faces.\nLeveraging our novel T-FAKE dataset, probabilistic landmark prediction, and\nlabel adaptation networks, we demonstrate significant improvements in landmark\ndetection methods on thermal images across different landmark conventions. Our\nmodels show excellent performance with both sparse 70-point landmarks and dense\n478-point landmark annotations. Our code and models are available at\nhttps://github.com/phflot/tfake.\n","authors":["Philipp Flotho","Moritz Piening","Anna Kukleva","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2408.15127v1.pdf","comment":"22 pages, 12 figures, Philipp Flotho and Moritz Piening share equal\n contribution"},{"id":"http://arxiv.org/abs/2408.15122v1","updated":"2024-08-27T15:03:20Z","published":"2024-08-27T15:03:20Z","title":"Machine Learning for Methane Detection and Quantification from Space --\n A survey","summary":" Methane (CH_4) is a potent anthropogenic greenhouse gas, contributing 86\ntimes more to global warming than Carbon Dioxide (CO_2) over 20 years, and it\nalso acts as an air pollutant. Given its high radiative forcing potential and\nrelatively short atmospheric lifetime (9\\textpm1 years), methane has important\nimplications for climate change, therefore, cutting methane emissions is\ncrucial for effective climate change mitigation. This work expands existing\ninformation on operational methane point source detection sensors in the\nShort-Wave Infrared (SWIR) bands. It reviews the state-of-the-art for\ntraditional as well as Machine Learning (ML) approaches. The architecture and\ndata used in such ML models will be discussed separately for methane plume\nsegmentation and emission rate estimation. Traditionally, experts rely on\nlabor-intensive manually adjusted methods for methane detection. However, ML\napproaches offer greater scalability. Our analysis reveals that ML models\noutperform traditional methods, particularly those based on convolutional\nneural networks (CNN), which are based on the U-net and transformer\narchitectures. These ML models extract valuable information from\nmethane-sensitive spectral data, enabling a more accurate detection. Challenges\narise when comparing these methods due to variations in data, sensor\nspecifications, and evaluation metrics. To address this, we discuss existing\ndatasets and metrics, providing an overview of available resources and\nidentifying open research problems. Finally, we explore potential future\nadvances in ML, emphasizing approaches for model comparability, large dataset\ncreation, and the European Union's forthcoming methane strategy.\n","authors":["Enno Tiemann","Shanyu Zhou","Alexander Kläser","Konrad Heidler","Rochelle Schneider","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.15122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05892v3","updated":"2024-08-27T15:00:53Z","published":"2024-08-12T02:10:18Z","title":"Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer\n Detection","summary":" Polyp segmentation plays a crucial role in the early detection and diagnosis\nof colorectal cancer. However, obtaining accurate segmentations often requires\nlabor-intensive annotations and specialized models. Recently, Meta AI Research\nreleased a general Segment Anything Model 2 (SAM 2), which has demonstrated\npromising performance in several segmentation tasks. 
In this manuscript, we\nevaluate the performance of SAM 2 in segmenting polyps under various prompted\nsettings. We hope this report will provide insights to advance the field of\npolyp segmentation and promote more interesting work in the future. This\nproject is publicly available at https://github.com/sajjad-sh33/Polyp-SAM-2.\n","authors":["Mobina Mansoori","Sajjad Shahabodini","Jamshid Abouei","Konstantinos N. Plataniotis","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2408.05892v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15119v1","updated":"2024-08-27T14:58:13Z","published":"2024-08-27T14:58:13Z","title":"Urdu Digital Text Word Optical Character Recognition Using Permuted Auto\n Regressive Sequence Modeling","summary":" This research paper introduces an innovative word-level Optical Character\nRecognition (OCR) model specifically designed for digital Urdu text\nrecognition. Utilizing transformer-based architectures and attention\nmechanisms, the model was trained on a comprehensive dataset of approximately\n160,000 Urdu text images, achieving a character error rate (CER) of 0.178,\nwhich highlights its superior accuracy in recognizing Urdu characters. The\nmodel's strength lies in its unique architecture, incorporating the permuted\nautoregressive sequence (PARSeq) model, which allows for context-aware\ninference and iterative refinement by leveraging bidirectional context\ninformation to enhance recognition accuracy. Furthermore, its capability to\nhandle a diverse range of Urdu text styles, fonts, and variations enhances its\napplicability in real-world scenarios. Despite its promising results, the model\nhas some limitations, such as difficulty with blurred images, non-horizontal\norientations, and overlays of patterns, lines, or other text, which can\noccasionally lead to suboptimal performance. Additionally, trailing or\nfollowing punctuation marks can introduce noise into the recognition process.\nAddressing these challenges will be a focus of future research, aiming to\nrefine the model further, explore data augmentation techniques, optimize\nhyperparameters, and integrate contextual improvements for more accurate and\nefficient Urdu text recognition.\n","authors":["Ahmed Mustafa","Ijlal Baig","Hasan Sajid"],"pdf_url":"https://arxiv.org/pdf/2408.15119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15118v1","updated":"2024-08-27T14:58:08Z","published":"2024-08-27T14:58:08Z","title":"DIFR3CT: Latent Diffusion for Probabilistic 3D CT Reconstruction from\n Few Planar X-Rays","summary":" Computed Tomography (CT) scans are the standard-of-care for the visualization\nand diagnosis of many clinical ailments, and are needed for the treatment\nplanning of external beam radiotherapy. Unfortunately, the availability of CT\nscanners in low- and mid-resource settings is highly variable. Planar x-ray\nradiography units, in comparison, are far more prevalent, but can only provide\nlimited 2D observations of the 3D anatomy. In this work we propose DIFR3CT, a\n3D latent diffusion model, that can generate a distribution of plausible CT\nvolumes from one or few (<10) planar x-ray observations. DIFR3CT works by\nfusing 2D features from each x-ray into a joint 3D space, and performing\ndiffusion conditioned on these fused features in a low-dimensional latent\nspace. 
We conduct extensive experiments demonstrating that DIFR3CT is better\nthan recent sparse CT reconstruction baselines in terms of standard pixel-level\n(PSNR, SSIM) on both the public LIDC and in-house post-mastectomy CT datasets.\nWe also show that DIFR3CT supports uncertainty quantification via Monte Carlo\nsampling, which provides an opportunity to measure reconstruction reliability.\nFinally, we perform a preliminary pilot study evaluating DIFR3CT for automated\nbreast radiotherapy contouring and planning -- and demonstrate promising\nfeasibility. Our code is available at https://github.com/yransun/DIFR3CT.\n","authors":["Yiran Sun","Hana Baroudi","Tucker Netherton","Laurence Court","Osama Mawlawi","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2408.15118v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15114v1","updated":"2024-08-27T14:54:33Z","published":"2024-08-27T14:54:33Z","title":"Few-Shot Unsupervised Implicit Neural Shape Representation Learning with\n Spatial Adversaries","summary":" Implicit Neural Representations have gained prominence as a powerful\nframework for capturing complex data modalities, encompassing a wide range from\n3D shapes to images and audio. Within the realm of 3D shape representation,\nNeural Signed Distance Functions (SDF) have demonstrated remarkable potential\nin faithfully encoding intricate shape geometry. However, learning SDFs from\nsparse 3D point clouds in the absence of ground truth supervision remains a\nvery challenging task. While recent methods rely on smoothness priors to\nregularize the learning, our method introduces a regularization term that\nleverages adversarial samples around the shape to improve the learned SDFs.\nThrough extensive experiments and evaluations, we illustrate the efficacy of\nour proposed method, highlighting its capacity to improve SDF learning with\nrespect to baselines and the state-of-the-art using synthetic and real data.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2408.15114v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.15113v1","updated":"2024-08-27T14:51:34Z","published":"2024-08-27T14:51:34Z","title":"AnomalousPatchCore: Exploring the Use of Anomalous Samples in Industrial\n Anomaly Detection","summary":" Visual inspection, or industrial anomaly detection, is one of the most common\nquality control types in manufacturing. The task is to identify the presence of\nan anomaly given an image, e.g., a missing component on an image of a circuit\nboard, for subsequent manual inspection. While industrial anomaly detection has\nseen a surge in recent years, most anomaly detection methods still utilize\nknowledge only from normal samples, failing to leverage the information from\nthe frequently available anomalous samples. Additionally, they heavily rely on\nvery general feature extractors pre-trained on common image classification\ndatasets. In this paper, we address these shortcomings and propose the new\nanomaly detection system AnomalousPatchCore~(APC) based on a feature extractor\nfine-tuned with normal and anomalous in-domain samples and a subsequent memory\nbank for identifying unusual features. To fine-tune the feature extractor in\nAPC, we propose three auxiliary tasks that address the different aspects of\nanomaly detection~(classification vs. localization) and mitigate the effect of\nthe imbalance between normal and anomalous samples. 
Our extensive evaluation on\nthe MVTec dataset shows that APC outperforms state-of-the-art systems in\ndetecting anomalies, which is especially important in industrial anomaly\ndetection given the subsequent manual inspection. In detailed ablation studies,\nwe further investigate the properties of our APC.\n","authors":["Mykhailo Koshil","Tilman Wegener","Detlef Mentrup","Simone Frintrop","Christian Wilms"],"pdf_url":"https://arxiv.org/pdf/2408.15113v1.pdf","comment":"Accepted at the 2nd workshop on Vision-based InduStrial InspectiON\n (VISION) @ ECCV"},{"id":"http://arxiv.org/abs/2408.15103v1","updated":"2024-08-27T14:40:19Z","published":"2024-08-27T14:40:19Z","title":"Enhancing License Plate Super-Resolution: A Layout-Aware and\n Character-Driven Approach","summary":" Despite significant advancements in License Plate Recognition (LPR) through\ndeep learning, most improvements rely on high-resolution images with clear\ncharacters. This scenario does not reflect real-world conditions where traffic\nsurveillance often captures low-resolution and blurry images. Under these\nconditions, characters tend to blend with the background or neighboring\ncharacters, making accurate LPR challenging. To address this issue, we\nintroduce a novel loss function, Layout and Character Oriented Focal Loss\n(LCOFL), which considers factors such as resolution, texture, and structural\ndetails, as well as the performance of the LPR task itself. We enhance\ncharacter feature learning using deformable convolutions and shared weights in\nan attention module and employ a GAN-based training approach with an Optical\nCharacter Recognition (OCR) model as the discriminator to guide the\nsuper-resolution process. Our experimental results show significant\nimprovements in character reconstruction quality, outperforming two\nstate-of-the-art methods in both quantitative and qualitative measures. Our\ncode is publicly available at https://github.com/valfride/lpsr-lacd\n","authors":["Valfride Nascimento","Rayson Laroca","Rafael O. Ribeiro","William Robson Schwartz","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2408.15103v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2024"},{"id":"http://arxiv.org/abs/2408.15101v1","updated":"2024-08-27T14:36:46Z","published":"2024-08-27T14:36:46Z","title":"MTMamba++: Enhancing Multi-Task Dense Scene Understanding via\n Mamba-Based Decoders","summary":" Multi-task dense scene understanding, which trains a model for multiple dense\nprediction tasks, has a wide range of application scenarios. Capturing\nlong-range dependency and enhancing cross-task interactions are crucial to\nmulti-task dense prediction. In this paper, we propose MTMamba++, a novel\narchitecture for multi-task scene understanding featuring with a Mamba-based\ndecoder. It contains two types of core blocks: self-task Mamba (STM) block and\ncross-task Mamba (CTM) block. STM handles long-range dependency by leveraging\nstate-space models, while CTM explicitly models task interactions to facilitate\ninformation exchange across tasks. We design two types of CTM block, namely\nF-CTM and S-CTM, to enhance cross-task interaction from feature and semantic\nperspectives, respectively. Experiments on NYUDv2, PASCAL-Context, and\nCityscapes datasets demonstrate the superior performance of MTMamba++ over\nCNN-based and Transformer-based methods. 
The code is available at\nhttps://github.com/EnVision-Research/MTMamba.\n","authors":["Baijiong Lin","Weisen Jiang","Pengguang Chen","Shu Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15101v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.02228"},{"id":"http://arxiv.org/abs/2408.15098v1","updated":"2024-08-27T14:30:36Z","published":"2024-08-27T14:30:36Z","title":"CLIP-AGIQA: Boosting the Performance of AI-Generated Image Quality\n Assessment with CLIP","summary":" With the rapid development of generative technologies, AI-Generated Images\n(AIGIs) have been widely applied in various aspects of daily life. However, due\nto the immaturity of the technology, the quality of the generated images\nvaries, so it is important to develop quality assessment techniques for the\ngenerated images. Although some models have been proposed to assess the quality\nof generated images, they are inadequate when faced with the ever-increasing\nand diverse categories of generated images. Consequently, the development of\nmore advanced and effective models for evaluating the quality of generated\nimages is urgently needed. Recent research has explored the significant\npotential of the visual language model CLIP in image quality assessment,\nfinding that it performs well in evaluating the quality of natural images.\nHowever, its application to generated images has not been thoroughly\ninvestigated. In this paper, we build on this idea and further explore the\npotential of CLIP in evaluating the quality of generated images. We design\nCLIP-AGIQA, a CLIP-based regression model for quality assessment of generated\nimages, leveraging rich visual and textual knowledge encapsulated in CLIP.\nParticularly, we implement multi-category learnable prompts to fully utilize\nthe textual knowledge in CLIP for quality assessment. Extensive experiments on\nseveral generated image quality assessment benchmarks, including AGIQA-3K and\nAIGCIQA2023, demonstrate that CLIP-AGIQA outperforms existing IQA models,\nachieving excellent results in evaluating the quality of generated images.\n","authors":["Zhenchen Tang","Zichuan Wang","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2408.15098v1.pdf","comment":"accepted by ICPR2024"},{"id":"http://arxiv.org/abs/2408.15094v1","updated":"2024-08-27T14:25:42Z","published":"2024-08-27T14:25:42Z","title":"Constrained Diffusion Models via Dual Training","summary":" Diffusion models have attained prominence for their ability to synthesize a\nprobability distribution for a given dataset via a diffusion process, enabling\nthe generation of new data points with high fidelity. However, diffusion\nprocesses are prone to generating biased data based on the training dataset. To\naddress this issue, we develop constrained diffusion models by imposing\ndiffusion constraints based on desired distributions that are informed by\nrequirements. Specifically, we cast the training of diffusion models under\nrequirements as a constrained distribution optimization problem that aims to\nreduce the distribution difference between original and generated data while\nobeying constraints on the distribution of generated data. We show that our\nconstrained diffusion models generate new data from a mixture data distribution\nthat achieves the optimal trade-off among objective and constraints. To train\nconstrained diffusion models, we develop a dual training algorithm and\ncharacterize the optimality of the trained constrained diffusion model. 
We\nempirically demonstrate the effectiveness of our constrained models in two\nconstrained generation tasks: (i) we consider a dataset with one or more\nunderrepresented classes where we train the model with constraints to ensure\nfairly sampling from all classes during inference; (ii) we fine-tune a\npre-trained diffusion model to sample from a new dataset while avoiding\noverfitting.\n","authors":["Shervin Khalafi","Dongsheng Ding","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2408.15094v1.pdf","comment":"41 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.10895v4","updated":"2024-08-27T14:23:51Z","published":"2023-07-20T14:18:44Z","title":"Variational Autoencoding of Dental Point Clouds","summary":" Digital dentistry has made significant advancements, yet numerous challenges\nremain. This paper introduces the FDI 16 dataset, an extensive collection of\ntooth meshes and point clouds. Additionally, we present a novel approach:\nVariational FoldingNet (VF-Net), a fully probabilistic variational autoencoder\nfor point clouds. Notably, prior latent variable models for point clouds lack a\none-to-one correspondence between input and output points. Instead, they rely\non optimizing Chamfer distances, a metric that lacks a normalized\ndistributional counterpart, rendering it unsuitable for probabilistic modeling.\nWe replace the explicit minimization of Chamfer distances with a suitable\nencoder, increasing computational efficiency while simplifying the\nprobabilistic extension. This allows for straightforward application in various\ntasks, including mesh generation, shape completion, and representation\nlearning. Empirically, we provide evidence of lower reconstruction error in\ndental reconstruction and interpolation, showcasing state-of-the-art\nperformance in dental sample generation while identifying valuable latent\nrepresentations\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.01870v2","updated":"2024-08-27T14:16:06Z","published":"2022-09-05T10:06:03Z","title":"Unsupervised Domain Adaptation via Style-Aware Self-intermediate Domain","summary":" Unsupervised domain adaptation (UDA) has attracted considerable attention,\nwhich transfers knowledge from a label-rich source domain to a related but\nunlabeled target domain. Reducing inter-domain differences has always been a\ncrucial factor to improve performance in UDA, especially for tasks where there\nis a large gap between source and target domains. To this end, we propose a\nnovel style-aware feature fusion method (SAFF) to bridge the large domain gap\nand transfer knowledge while alleviating the loss of class-discriminative\ninformation. Inspired by the human transitive inference and learning ability, a\nnovel style-aware self-intermediate domain (SSID) is investigated to link two\nseemingly unrelated concepts through a series of intermediate auxiliary\nsynthesized concepts. 
Specifically, we propose a novel learning strategy of\nSSID, which selects samples from both source and target domains as anchors, and\nthen randomly fuses the object and style features of these anchors to generate\nlabeled and style-rich intermediate auxiliary features for knowledge transfer.\nMoreover, we design an external memory bank to store and update specified\nlabeled features to obtain stable class features and class-wise style features.\nBased on the proposed memory bank, the intra- and inter-domain loss functions\nare designed to improve the class recognition ability and feature\ncompatibility, respectively. Meanwhile, we simulate the rich latent feature\nspace of SSID by infinite sampling and the convergence of the loss function by\nmathematical theory. Finally, we conduct comprehensive experiments on commonly\nused domain adaptive benchmarks to evaluate the proposed SAFF, and the\nexperimental results show that the proposed SAFF can be easily combined with\ndifferent backbone networks and obtain better performance as a plug-in-plug-out\nmodule.\n","authors":["Lianyu Wang","Meng Wang","Daoqiang Zhang","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2209.01870v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.13627v2","updated":"2024-08-27T14:14:51Z","published":"2024-08-24T16:48:25Z","title":"Recent Event Camera Innovations: A Survey","summary":" Event-based vision, inspired by the human visual system, offers\ntransformative capabilities such as low latency, high dynamic range, and\nreduced power consumption. This paper presents a comprehensive survey of event\ncameras, tracing their evolution over time. It introduces the fundamental\nprinciples of event cameras, compares them with traditional frame cameras, and\nhighlights their unique characteristics and operational differences. The survey\ncovers various event camera models from leading manufacturers, key\ntechnological milestones, and influential research contributions. It explores\ndiverse application areas across different domains and discusses essential\nreal-world and synthetic datasets for research advancement. Additionally, the\nrole of event camera simulators in testing and development is discussed. This\nsurvey aims to consolidate the current state of event cameras and inspire\nfurther innovation in this rapidly evolving field. To support the research\ncommunity, a GitHub page\n(https://github.com/chakravarthi589/Event-based-Vision_Resources) categorizes\npast and future research articles and consolidates valuable resources.\n","authors":["Bharatesh Chakravarthi","Aayush Atul Verma","Kostas Daniilidis","Cornelia Fermuller","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2408.13627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15077v1","updated":"2024-08-27T14:05:48Z","published":"2024-08-27T14:05:48Z","title":"MMASD+: A Novel Dataset for Privacy-Preserving Behavior Analysis of\n Children with Autism Spectrum Disorder","summary":" Autism spectrum disorder (ASD) is characterized by significant challenges in\nsocial interaction and comprehending communication signals. Recently,\ntherapeutic interventions for ASD have increasingly utilized Deep learning\npowered-computer vision techniques to monitor individual progress over time.\nThese models are trained on private, non-public datasets from the autism\ncommunity, creating challenges in comparing results across different models due\nto privacy-preserving data-sharing issues. This work introduces MMASD+. 
MMASD+\nconsists of diverse data modalities, including 3D-Skeleton, 3D Body Mesh, and\nOptical Flow data. It integrates the capabilities of Yolov8 and Deep SORT\nalgorithms to distinguish between the therapist and children, addressing a\nsignificant barrier in the original dataset. Additionally, a Multimodal\nTransformer framework is proposed to predict 11 action types and the presence\nof ASD. This framework achieves an accuracy of 95.03% for predicting action\ntypes and 96.42% for predicting ASD presence, demonstrating over a 10%\nimprovement compared to models trained on single data modalities. These\nfindings highlight the advantages of integrating multiple data modalities\nwithin the Multimodal Transformer framework.\n","authors":["Pavan Uttej Ravva","Behdokht Kiafar","Pinar Kullu","Jicheng Li","Anjana Bhat","Roghayeh Leila Barmaki"],"pdf_url":"https://arxiv.org/pdf/2408.15077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15069v1","updated":"2024-08-27T13:56:48Z","published":"2024-08-27T13:56:48Z","title":"Geometric Artifact Correction for Symmetric Multi-Linear Trajectory CT:\n Theory, Method, and Generalization","summary":" For extending CT field-of-view to perform non-destructive testing, the\nSymmetric Multi-Linear trajectory Computed Tomography (SMLCT) has been\ndeveloped as a successful example of non-standard CT scanning modes. However,\ninevitable geometric errors can cause severe artifacts in the reconstructed\nimages. The existing calibration method for SMLCT is both crude and\ninefficient. It involves reconstructing hundreds of images by exhaustively\nsubstituting each potential error, and then manually identifying the images\nwith the fewest geometric artifacts to estimate the final geometric errors for\ncalibration. In this paper, we comprehensively and efficiently address the\nchallenging geometric artifacts in SMLCT, and the corresponding works mainly\ninvolve theory, method, and generalization. In particular, after identifying\nsensitive parameters and conducting some theoretical analysis of geometric\nartifacts, we summarize several key properties between sensitive geometric\nparameters and artifact characteristics. Then, we further construct\nmathematical relationships that relate sensitive geometric errors to the pixel\noffsets of reconstruction images with artifact characteristics. To accurately\nextract pixel bias, we innovatively adapt the Generalized Cross-Correlation\nwith Phase Transform (GCC-PHAT) algorithm, commonly used in sound processing,\nfor our image registration task for each paired symmetric LCT. This adaptation\nleads to the design of a highly efficient rigid translation registration\nmethod. Simulation and physical experiments have validated the excellent\nperformance of this work. Additionally, our results demonstrate significant\ngeneralization to common rotated CT and a variant of SMLCT.\n","authors":["Zhisheng Wang","Yanxu Sun","Shangyu Li","Legeng Lin","Shunli Wang","Junning Cui"],"pdf_url":"https://arxiv.org/pdf/2408.15069v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.11470v2","updated":"2024-08-27T13:55:17Z","published":"2023-11-14T11:36:20Z","title":"An Improved Anomaly Detection Model for Automated Inspection of Power\n Line Insulators","summary":" Inspection of insulators is important to ensure reliable operation of the\npower system. Deep learning is being increasingly exploited to automate the\ninspection process by leveraging object detection models to analyse aerial\nimages captured by drones. 
A purely object detection-based approach, however,\nsuffers from class imbalance-induced poor performance, which can be accentuated\nfor infrequent and hard-to-detect incipient faults. This article proposes the\nuse of anomaly detection along with object detection in a two-stage approach\nfor incipient fault detection in a data-efficient manner. An explainable\nconvolutional one-class classifier is adopted for anomaly detection. The\none-class formulation reduces the reliance on plentifully available images of\nfaulty insulators, while the explainability of the model is expected to promote\nadoption by the industry. A modified loss function is developed that addresses\ncomputational and interpretability issues with the existing model, also\nallowing for the integration of other losses. The superiority of the novel loss\nfunction is demonstrated with the MVTec-AD dataset. The models are trained for\ninsulator inspection with two datasets -- representing data-abundant and\ndata-scarce scenarios -- in unsupervised and semi-supervised settings. The\nresults suggest that including as few as five real anomalies in the training\ndataset significantly improves the model's performance and enables reliable\ndetection of rarely occurring incipient faults in insulators.\n","authors":["Laya Das","Blazhe Gjorgiev","Giovanni Sansavini"],"pdf_url":"https://arxiv.org/pdf/2312.11470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15063v1","updated":"2024-08-27T13:47:31Z","published":"2024-08-27T13:47:31Z","title":"Adapting Segment Anything Model to Multi-modal Salient Object Detection\n with Semantic Feature Fusion Guidance","summary":" Although most existing multi-modal salient object detection (SOD) methods\ndemonstrate effectiveness through training models from scratch, the limited\nmulti-modal data hinders these methods from reaching optimality. In this paper,\nwe propose a novel framework to explore and exploit the powerful feature\nrepresentation and zero-shot generalization ability of the pre-trained Segment\nAnything Model (SAM) for multi-modal SOD. Despite serving as a recent vision\nfundamental model, driving the class-agnostic SAM to comprehend and detect\nsalient objects accurately is non-trivial, especially in challenging scenes. To\nthis end, we develop \\underline{SAM} with se\\underline{m}antic\nf\\underline{e}ature fu\\underline{s}ion guidanc\\underline{e} (Sammese), which\nincorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to\nmulti-modal SOD tasks. However, it is difficult for SAM trained on single-modal\ndata to directly mine the complementary benefits of multi-modal inputs and\ncomprehensively utilize them to achieve accurate saliency prediction. To address\nthese issues, we first design a multi-modal complementary fusion module to\nextract robust multi-modal semantic features by integrating information from\nvisible and thermal or depth image pairs. Then, we feed the extracted\nmulti-modal semantic features into both the SAM image encoder and mask decoder\nfor fine-tuning and prompting, respectively. Specifically, in the image\nencoder, a multi-modal adapter is proposed to adapt the single-modal SAM to\nmulti-modal information. In the mask decoder, a semantic-geometric prompt\ngeneration strategy is proposed to produce corresponding embeddings with\nvarious saliency cues. 
Extensive experiments on both RGB-D and RGB-T SOD\nbenchmarks show the effectiveness of the proposed framework.\n","authors":["Kunpeng Wang","Keke Chen","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15063v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.04833v3","updated":"2024-08-27T13:45:49Z","published":"2024-07-05T19:38:10Z","title":"3D Adaptive Structural Convolution Network for Domain-Invariant Point\n Cloud Recognition","summary":" Adapting deep learning networks for point cloud data recognition in\nself-driving vehicles faces challenges due to the variability in datasets and\nsensor technologies, emphasizing the need for adaptive techniques to maintain\naccuracy across different conditions. In this paper, we introduce the 3D\nAdaptive Structural Convolution Network (3D-ASCN), a cutting-edge framework for\n3D point cloud recognition. It combines 3D convolution kernels, a structural\ntree structure, and adaptive neighborhood sampling for effective geometric\nfeature extraction. This method obtains domain-invariant features and\ndemonstrates robust, adaptable performance on a variety of point cloud\ndatasets, ensuring compatibility across diverse sensor configurations without\nthe need for parameter adjustments. This highlights its potential to\nsignificantly enhance the reliability and efficiency of self-driving vehicle\ntechnology.\n","authors":["Younggun Kim","Beomsik Cho","Seonghoon Ryoo","Soomok Lee"],"pdf_url":"https://arxiv.org/pdf/2407.04833v3.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2209.11200v3","updated":"2024-08-27T13:44:11Z","published":"2022-09-22T17:42:44Z","title":"Attention is All They Need: Exploring the Media Archaeology of the\n Computer Vision Research Paper","summary":" Research papers, in addition to textual documents, are a designed interface\nthrough which researchers communicate. Recently, rapid growth has transformed\nthat interface in many fields of computing. In this work, we examine the\neffects of this growth from a media archaeology perspective, through the\nchanges to figures and tables in research papers. Specifically, we study these\nchanges in computer vision over the past decade, as the deep learning\nrevolution has driven unprecedented growth in the discipline. We ground our\ninvestigation through interviews with veteran researchers spanning computer\nvision, graphics, and visualization. Our analysis focuses on the research\nattention economy: how research paper elements contribute towards advertising,\nmeasuring, and disseminating an increasingly commodified \"contribution.\"\nThrough this work, we seek to motivate future discussion surrounding the design\nof both the research paper itself as well as the larger sociotechnical research\npublishing system, including tools for finding, reading, and writing research\npapers.\n","authors":["Samuel Goree","Gabriel Appleby","David Crandall","Norman Su"],"pdf_url":"https://arxiv.org/pdf/2209.11200v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14745v3","updated":"2024-08-27T13:36:12Z","published":"2024-04-23T04:54:32Z","title":"TAAT: Think and Act from Arbitrary Texts in Text2Motion","summary":" Text to Motion aims to generate human motions from texts. Existing settings\nassume that texts include action labels, which limits flexibility in practical\nscenarios. This paper extends this task with a more realistic assumption that\nthe texts are arbitrary. 
Specifically, in our setting, arbitrary texts include\nexisting action texts composed of action labels and introduce scene texts\nwithout explicit action labels. To address this practical issue, we extend the\naction texts in the HUMANML3D dataset by incorporating additional scene texts,\nthereby creating a new dataset, HUMANML3D++. Concurrently, we propose a simple\nframework that extracts action representations from arbitrary texts using a\nLarge Language Model (LLM) and subsequently generates motions. Furthermore, we\nenhance the existing evaluation methodologies to address their inadequacies.\nExtensive experiments are conducted under different application scenarios to\nvalidate the effectiveness of the proposed framework on existing and proposed\ndatasets. The results indicate that Text to Motion in this realistic setting is\nvery challenging, fostering new research in this practical direction. Our\ndataset and code will be released.\n","authors":["Runqi Wang","Caoyuan Ma","Guopeng Li","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14745v3.pdf","comment":"Updated errors in author information"},{"id":"http://arxiv.org/abs/2408.15045v1","updated":"2024-08-27T13:13:38Z","published":"2024-08-27T13:13:38Z","title":"DocLayLLM: An Efficient and Effective Multi-modal Extension of Large\n Language Models for Text-rich Document Understanding","summary":" Text-rich document understanding (TDU) refers to analyzing and comprehending\ndocuments containing substantial textual content. With the rapid evolution of\nlarge language models (LLMs), they have been widely leveraged for TDU due to\ntheir remarkable versatility and generalization. In this paper, we introduce\nDocLayLLM, an efficient and effective multi-modal extension of LLMs\nspecifically designed for TDU. By integrating visual patch tokens and 2D\npositional tokens into LLMs and encoding the document content using the LLMs\nthemselves, we fully take advantage of the document comprehension capability of\nLLMs and enhance their perception of OCR information. We have also deeply\nconsidered the role of the chain-of-thought (CoT) and innovatively proposed the\ntechniques of CoT Pre-training and CoT Annealing. Our DocLayLLM can achieve\nremarkable performances with lightweight training settings, showcasing its\nefficiency and effectiveness. Experimental results demonstrate that our\nDocLayLLM surpasses existing OCR-dependent methods and also outperforms\nOCR-free competitors.\n","authors":["Wenhui Liao","Jiapeng Wang","Hongliang Li","Chengyu Wang","Jun Huang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2408.15045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15038v1","updated":"2024-08-27T13:07:09Z","published":"2024-08-27T13:07:09Z","title":"Interactive Occlusion Boundary Estimation through Exploitation of\n Synthetic Data","summary":" Occlusion boundaries (OBs) geometrically localize the occlusion events in a\n2D image, and contain useful information for addressing various scene\nunderstanding problems. To advance their study, we have led the investigation\nin the following three aspects. Firstly, we have studied interactive estimation\nof OBs, which is the first in the literature, and proposed an efficient\ndeep-network-based method using multiple-scribble intervention, named DNMMSI,\nwhich significantly improves the performance over the state-of-the-art\nfully-automatic methods. 
Secondly, we propose to exploit the synthetic\nbenchmark for the training process, thanks to the particularity that OBs are\ndetermined geometrically and unambiguously from the 3D scene. To this end, we\nhave developed an efficient tool, named Mesh2OB, for the automatic generation\nof 2D images together with their ground-truth OBs, using which we have\nconstructed a synthetic benchmark, named OB-FUTURE. Abundant experimental\nresults demonstrate that leveraging such a synthetic benchmark for training\nachieves promising performance, even without the use of domain adaptation\ntechniques. Finally, to achieve a more compelling and robust evaluation in\nOB-related research, we have created a real benchmark, named OB-LabName,\nconsisting of 120 high-resolution images together with their ground-truth OBs,\nwith precision surpassing that of previous benchmarks. We will release DNMMSI\nwith pre-trained parameters, Mesh2OB, OB-FUTURE, and OB-LabName to support\nfurther research.\n","authors":["Lintao Xu","Chaohui Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20710v2","updated":"2024-08-27T13:05:27Z","published":"2023-10-31T17:59:58Z","title":"FPO++: Efficient Encoding and Rendering of Dynamic Neural Radiance\n Fields by Analyzing and Enhancing Fourier PlenOctrees","summary":" Fourier PlenOctrees have shown to be an efficient representation for\nreal-time rendering of dynamic Neural Radiance Fields (NeRF). Despite its many\nadvantages, this method suffers from artifacts introduced by the involved\ncompression when combining it with recent state-of-the-art techniques for\ntraining the static per-frame NeRF models. In this paper, we perform an\nin-depth analysis of these artifacts and leverage the resulting insights to\npropose an improved representation. In particular, we present a novel density\nencoding that adapts the Fourier-based compression to the characteristics of\nthe transfer function used by the underlying volume rendering procedure and\nleads to a substantial reduction of artifacts in the dynamic model.\nFurthermore, we show an augmentation of the training data that relaxes the\nperiodicity assumption of the compression. We demonstrate the effectiveness of\nour enhanced Fourier PlenOctrees in the scope of quantitative and qualitative\nevaluations on synthetic and real-world scenes.\n","authors":["Saskia Rabich","Patrick Stotko","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2310.20710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15032v1","updated":"2024-08-27T13:01:19Z","published":"2024-08-27T13:01:19Z","title":"Mamba2MIL: State Space Duality Based Multiple Instance Learning for\n Computational Pathology","summary":" Computational pathology (CPath) has significantly advanced the clinical\npractice of pathology. Despite the progress made, Multiple Instance Learning\n(MIL), a promising paradigm within CPath, continues to face challenges,\nparticularly related to incomplete information utilization. Existing\nframeworks, such as those based on Convolutional Neural Networks (CNNs),\nattention, and selective scan space state sequential model (SSM), lack\nsufficient flexibility and scalability in fusing diverse features, and cannot\neffectively fuse diverse features. Additionally, current approaches do not\nadequately exploit order-related and order-independent features, resulting in\nsuboptimal utilization of sequence information. To address these limitations,\nwe propose a novel MIL framework called Mamba2MIL. 
Our framework utilizes the\nstate space duality model (SSD) to model long sequences of patches of whole\nslide images (WSIs), which, combined with weighted feature selection, supports\nthe fusion processing of more branching features and can be extended according\nto specific application needs. Moreover, we introduce a sequence transformation\nmethod tailored to varying WSI sizes, which enhances sequence-independent\nfeatures while preserving local sequence information, thereby improving\nsequence information utilization. Extensive experiments demonstrate that\nMamba2MIL surpasses state-of-the-art MIL methods. We conducted extensive\nexperiments across multiple datasets, achieving improvements in nearly all\nperformance metrics. Specifically, on the NSCLC dataset, Mamba2MIL achieves a\nbinary tumor classification AUC of 0.9533 and an accuracy of 0.8794. On the\nBRACS dataset, it achieves a multiclass classification AUC of 0.7986 and an\naccuracy of 0.4981. The code is available at\nhttps://github.com/YuqiZhang-Buaa/Mamba2MIL.\n","authors":["Yuqi Zhang","Xiaoqian Zhang","Jiakai Wang","Yuancheng Yang","Taiying Peng","Chao Tong"],"pdf_url":"https://arxiv.org/pdf/2408.15032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15026v1","updated":"2024-08-27T12:55:54Z","published":"2024-08-27T12:55:54Z","title":"Sequence-aware Pre-training for Echocardiography Probe Guidance","summary":" Cardiac ultrasound probe guidance aims to help novices adjust the 6-DOF probe\npose to obtain high-quality sectional images. Cardiac ultrasound faces two\nmajor challenges: (1) the inherently complex structure of the heart, and (2)\nsignificant individual variations. Previous works have only learned the\npopulation-averaged 2D and 3D structures of the heart rather than personalized\ncardiac structural features, leading to a performance bottleneck. Clinically,\nwe observed that sonographers adjust their understanding of a patient's cardiac\nstructure based on prior scanning sequences, thereby modifying their scanning\nstrategies. Inspired by this, we propose a sequence-aware self-supervised\npre-training method. Specifically, our approach learns personalized 2D and 3D\ncardiac structural features by predicting the masked-out images and actions in\na scanning sequence. We hypothesize that if the model can predict the missing\ncontent it has acquired a good understanding of the personalized cardiac\nstructure. In the downstream probe guidance task, we also introduced a sequence\nmodeling approach that models individual cardiac structural information based\non the images and actions from historical scan data, enabling more accurate\nnavigation decisions. Experiments on a large-scale dataset with 1.36 million\nsamples demonstrated that our proposed sequence-aware paradigm can\nsignificantly reduce navigation errors, with translation errors decreasing by\n15.90% to 36.87% and rotation errors decreasing by 11.13% to 20.77%, compared\nto state-of-the-art methods.\n","authors":["Haojun Jiang","Zhenguo Sun","Yu Sun","Ning Jia","Meng Li","Shaqi Luo","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2408.15026v1.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2408.15020v1","updated":"2024-08-27T12:53:25Z","published":"2024-08-27T12:53:25Z","title":"Hierarchical Graph Interaction Transformer with Dynamic Token Clustering\n for Camouflaged Object Detection","summary":" Camouflaged object detection (COD) aims to identify the objects that\nseamlessly blend into the surrounding backgrounds. 
Due to the intrinsic\nsimilarity between the camouflaged objects and the background region, it is\nextremely challenging to precisely distinguish the camouflaged objects by\nexisting approaches. In this paper, we propose a hierarchical graph interaction\nnetwork termed HGINet for camouflaged object detection, which is capable of\ndiscovering imperceptible objects via effective graph interaction among the\nhierarchical tokenized features. Specifically, we first design a region-aware\ntoken focusing attention (RTFA) with dynamic token clustering to excavate the\npotentially distinguishable tokens in the local region. Afterwards, a\nhierarchical graph interaction transformer (HGIT) is proposed to construct\nbi-directional aligned communication between hierarchical features in the\nlatent interaction space for visual semantics enhancement. Furthermore, we\npropose a decoder network with confidence aggregated feature fusion (CAFF)\nmodules, which progressively fuses the hierarchical interacted features to\nrefine the local detail in ambiguous regions. Extensive experiments conducted\non the prevalent datasets, i.e. COD10K, CAMO, NC4K and CHAMELEON demonstrate\nthe superior performance of HGINet compared to existing state-of-the-art\nmethods. Our code is available at https://github.com/Garyson1204/HGINet.\n","authors":["Siyuan Yao","Hao Sun","Tian-Zhu Xiang","Xiao Wang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2408.15020v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing"},{"id":"http://arxiv.org/abs/2408.15015v1","updated":"2024-08-27T12:50:12Z","published":"2024-08-27T12:50:12Z","title":"Alternating Minimization Schemes for Computing\n Rate-Distortion-Perception Functions with $f$-Divergence Perception\n Constraints","summary":" We study the computation of the rate-distortion-perception function (RDPF)\nfor discrete memoryless sources subject to a single-letter average distortion\nconstraint and a perception constraint that belongs to the family of\n$f$-divergences. In this setting, the RDPF forms a convex programming problem\nfor which we characterize the optimal parametric solutions. We employ the\ndeveloped solutions in an alternating minimization scheme, namely Optimal\nAlternating Minimization (OAM), for which we provide convergence guarantees.\nNevertheless, the OAM scheme does not lead to a direct implementation of a\ngeneralized Blahut-Arimoto (BA) type of algorithm due to the presence of\nimplicit equations in the structure of the iteration. To overcome this\ndifficulty, we propose two alternative minimization approaches whose\napplicability depends on the smoothness of the used perception metric: a\nNewton-based Alternating Minimization (NAM) scheme, relying on Newton's\nroot-finding method for the approximation of the optimal iteration solution,\nand a Relaxed Alternating Minimization (RAM) scheme, based on a relaxation of\nthe OAM iterates. Both schemes are shown, via the derivation of necessary and\nsufficient conditions, to guarantee convergence to a globally optimal solution.\nWe also provide sufficient conditions on the distortion and the perception\nconstraints which guarantee that the proposed algorithms converge exponentially\nfast in the number of iteration steps. We corroborate our theoretical results\nwith numerical simulations and draw connections with existing results.\n","authors":["Giuseppe Serra","Photios A. 
Stavrou","Marios Kountouris"],"pdf_url":"https://arxiv.org/pdf/2408.15015v1.pdf","comment":"This work has been submitted for possible publication"},{"id":"http://arxiv.org/abs/2408.15011v1","updated":"2024-08-27T12:48:46Z","published":"2024-08-27T12:48:46Z","title":"Pre-training Everywhere: Parameter-Efficient Fine-Tuning for Medical\n Image Analysis via Target Parameter Pre-training","summary":" Parameter-efficient fine-tuning (PEFT) techniques have emerged to address\nissues of overfitting and high computational costs associated with fully\nfine-tuning in the paradigm of self-supervised learning. Mainstream methods\nbased on PEFT involve adding a few trainable parameters while keeping the\npre-trained parameters of the backbone fixed. These methods achieve\ncomparative, and often superior, performance to fully fine-tuning,\ndemonstrating the powerful representation ability of the pre-trained backbone.\nDespite its success, these methods typically ignore the initialization of the\nnew parameters, often relying solely on random initialization. We argue that if\npre-training is significantly beneficial, it should be applied to all\nparameters requiring representational capacity. Motivated by this insight, we\npropose a simple yet effective fine-tuning framework based on Target Parameter\nPre-training (TPP). The target parameters refer to the new parameters\nintroduced during fine-tuning. TPP includes an additional stage before PEFT to\npre-train these target parameters. During this stage, the pre-trained backbone\nparameters are frozen, and only the target parameters are trainable. A defined\npre-text task is used to encourage the target parameters to learn specific\nrepresentations of downstream data. When PEFT is subsequently employed, the\npre-trained target parameters are loaded to enhance fine-tuning efficiency. The\nproposed TPP framework is versatile, allowing for the integration of various\npretext tasks for pre-training and supporting different PEFT methods as\nbackbones. We evaluated the fine-tining performance of our method using five\npublic datasets, including three modalities and two task types. The results\ndemonstrate that the proposed TPP can be easily integrated into existing PEFT\nmethods, significantly improving performance.\n","authors":["Xingliang Lei","Yiwen Ye","Ziyang Chen","Minglei Shu","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2408.15011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03209v2","updated":"2024-08-27T12:39:18Z","published":"2024-08-06T14:08:22Z","title":"IPAdapter-Instruct: Resolving Ambiguity in Image-based Conditioning\n using Instruct Prompts","summary":" Diffusion models continuously push the boundary of state-of-the-art image\ngeneration, but the process is hard to control with any nuance: practice proves\nthat textual prompts are inadequate for accurately describing image style or\nfine structural details (such as faces). ControlNet and IPAdapter address this\nshortcoming by conditioning the generative process on imagery instead, but each\nindividual instance is limited to modeling a single conditional posterior: for\npractical use-cases, where multiple different posteriors are desired within the\nsame workflow, training and using multiple adapters is cumbersome. We propose\nIPAdapter-Instruct, which combines natural-image conditioning with ``Instruct''\nprompts to swap between interpretations for the same conditioning image: style\ntransfer, object extraction, both, or something else still? 
IPAdapterInstruct\nefficiently learns multiple tasks with minimal loss in quality compared to\ndedicated per-task models.\n","authors":["Ciara Rowles","Shimon Vainer","Dante De Nigris","Slava Elizarov","Konstantin Kutsy","Simon Donné"],"pdf_url":"https://arxiv.org/pdf/2408.03209v2.pdf","comment":"17 pages, 10 figures, Project page:\n https://unity-research.github.io/IP-Adapter-Instruct.github.io/"},{"id":"http://arxiv.org/abs/2408.15002v1","updated":"2024-08-27T12:34:41Z","published":"2024-08-27T12:34:41Z","title":"Knowledge Discovery in Optical Music Recognition: Enhancing Information\n Retrieval with Instance Segmentation","summary":" Optical Music Recognition (OMR) automates the transcription of musical\nnotation from images into machine-readable formats like MusicXML, MEI, or MIDI,\nsignificantly reducing the costs and time of manual transcription. This study\nexplores knowledge discovery in OMR by applying instance segmentation using\nMask R-CNN to enhance the detection and delineation of musical symbols in sheet\nmusic. Unlike Optical Character Recognition (OCR), OMR must handle the\nintricate semantics of Common Western Music Notation (CWMN), where symbol\nmeanings depend on shape, position, and context. Our approach leverages\ninstance segmentation to manage the density and overlap of musical symbols,\nfacilitating more precise information retrieval from music scores. Evaluations\non the DoReMi and MUSCIMA++ datasets demonstrate substantial improvements, with\nour method achieving a mean Average Precision (mAP) of up to 59.70\\% in dense\nsymbol environments, achieving comparable results to object detection.\nFurthermore, using traditional computer vision techniques, we add a parallel\nstep for staff detection to infer the pitch for the recognised symbols. This\nstudy emphasises the role of pixel-wise segmentation in advancing accurate\nmusic symbol recognition, contributing to knowledge discovery in OMR. Our\nfindings indicate that instance segmentation provides more precise\nrepresentations of musical symbols, particularly in densely populated scores,\nadvancing OMR technology. We make our implementation, pre-processing scripts,\ntrained models, and evaluation results publicly available to support further\nresearch and development.\n","authors":["Elona Shatri","George Fazekas"],"pdf_url":"https://arxiv.org/pdf/2408.15002v1.pdf","comment":"8 pages content and one references, accepted version at the\n International Conference on Knowledge Discovery and Information Retrieval\n 2024, Porto, Portugal"},{"id":"http://arxiv.org/abs/2408.14998v1","updated":"2024-08-27T12:28:41Z","published":"2024-08-27T12:28:41Z","title":"FastTextSpotter: A High-Efficiency Transformer for Multilingual Scene\n Text Spotting","summary":" The proliferation of scene text in both structured and unstructured\nenvironments presents significant challenges in optical character recognition\n(OCR), necessitating more efficient and robust text spotting solutions. This\npaper presents FastTextSpotter, a framework that integrates a Swin Transformer\nvisual backbone with a Transformer Encoder-Decoder architecture, enhanced by a\nnovel, faster self-attention unit, SAC2, to improve processing speeds while\nmaintaining accuracy. 
FastTextSpotter has been validated across multiple\ndatasets, including ICDAR2015 for regular texts and CTW1500 and TotalText for\narbitrary-shaped texts, benchmarking against current state-of-the-art models.\nOur results indicate that FastTextSpotter not only achieves superior accuracy\nin detecting and recognizing multilingual scene text (English and Vietnamese)\nbut also improves model efficiency, thereby setting new benchmarks in the\nfield. This study underscores the potential of advanced transformer\narchitectures in improving the adaptability and speed of text spotting\napplications in diverse real-world settings. The dataset, code, and pre-trained\nmodels have been released in our Github.\n","authors":["Alloy Das","Sanket Biswas","Umapada Pal","Josep Lladós","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2408.14998v1.pdf","comment":"Accepted in ICPR 2024"},{"id":"http://arxiv.org/abs/2408.14997v1","updated":"2024-08-27T12:25:12Z","published":"2024-08-27T12:25:12Z","title":"Depth Restoration of Hand-Held Transparent Objects for Human-to-Robot\n Handover","summary":" Transparent objects are common in daily life, while their unique optical\nproperties pose challenges for RGB-D cameras, which struggle to capture\naccurate depth information. For assistant robots, accurately perceiving\ntransparent objects held by humans is essential for effective human-robot\ninteraction. This paper presents a Hand-Aware Depth Restoration (HADR) method\nfor hand-held transparent objects based on creating an implicit neural\nrepresentation function from a single RGB-D image. The proposed method\nintroduces the hand posture as an important guidance to leverage semantic and\ngeometric information. To train and evaluate the proposed method, we create a\nhigh-fidelity synthetic dataset called TransHand-14K with a real-to-sim data\ngeneration scheme. Experiments show that our method has a better performance\nand generalization ability compared with existing methods. We further develop a\nreal-world human-to-robot handover system based on the proposed depth\nrestoration method, demonstrating its application value in human-robot\ninteraction.\n","authors":["Ran Yu","Haixin Yu","Huang Yan","Ziwu Song","Shoujie Li","Wenbo Ding"],"pdf_url":"https://arxiv.org/pdf/2408.14997v1.pdf","comment":"7 pages, 7 figures, conference"},{"id":"http://arxiv.org/abs/2407.21687v2","updated":"2024-08-27T12:03:00Z","published":"2024-07-31T15:29:34Z","title":"Dynamic Object Queries for Transformer-based Incremental Object\n Detection","summary":" Incremental object detection (IOD) aims to sequentially learn new classes,\nwhile maintaining the capability to locate and identify old ones. As the\ntraining data arrives with annotations only with new classes, IOD suffers from\ncatastrophic forgetting. Prior methodologies mainly tackle the forgetting issue\nthrough knowledge distillation and exemplar replay, ignoring the conflict\nbetween limited model capacity and increasing knowledge. In this paper, we\nexplore \\textit{dynamic object queries} for incremental object detection built\non Transformer architecture. We propose the \\textbf{Dy}namic object\n\\textbf{Q}uery-based \\textbf{DE}tection \\textbf{TR}ansformer (DyQ-DETR), which\nincrementally expands the model representation ability to achieve\nstability-plasticity tradeoff. First, a new set of learnable object queries are\nfed into the decoder to represent new classes. 
These new object queries are\naggregated with those from previous phases to adapt both old and new knowledge\nwell. Second, we propose the isolated bipartite matching for object queries in\ndifferent phases, based on disentangled self-attention. The interaction among\nthe object queries at different phases is eliminated to reduce inter-class\nconfusion. Thanks to the separate supervision and computation over object\nqueries, we further present the risk-balanced partial calibration for effective\nexemplar replay. Extensive experiments demonstrate that DyQ-DETR significantly\nsurpasses the state-of-the-art methods, with limited parameter overhead. Code\nwill be made publicly available.\n","authors":["Jichuan Zhang","Wei Li","Shuang Cheng","Ya-Li Li","Shengjin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07257v2","updated":"2024-08-27T11:44:56Z","published":"2024-05-12T11:41:44Z","title":"Listen, Disentangle, and Control: Controllable Speech-Driven Talking\n Head Generation","summary":" Most earlier investigations on talking face generation have focused on the\nsynchronization of lip motion and speech content. However, human head pose and\nfacial emotions are equally important characteristics of natural human faces.\nWhile audio-driven talking face generation has seen notable advancements,\nexisting methods either overlook facial emotions or are limited to specific\nindividuals and cannot be applied to arbitrary subjects. In this paper, we\npropose a one-shot Talking Head Generation framework (SPEAK) that distinguishes\nitself from general Talking Face Generation by enabling emotional and postural\ncontrol. Specifically, we introduce the Inter-Reconstructed Feature\nDisentanglement (IRFD) method to decouple human facial features into three\nlatent spaces. We then design a face editing module that modifies speech\ncontent and facial latent codes into a single latent space. Subsequently, we\npresent a novel generator that employs modified latent codes derived from the\nediting module to regulate emotional expression, head poses, and speech content\nin synthesizing facial animations. Extensive trials demonstrate that our method\ncan generate realistic talking head with coordinated lip motions, authentic\nfacial emotions, and smooth head movements. The demo video is available at the\nanonymous link: https://anonymous.4open.science/r/SPEAK-F56E\n","authors":["Changpeng Cai","Guinan Guo","Jiao Li","Junhao Su","Chenghao He","Jing Xiao","Yuanxu Chen","Lei Dai","Feiyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.07257v2.pdf","comment":"Due to our negligence, there are factual errors in the experimental\n results, so we are considering resubmitting the paper after an overhaul"},{"id":"http://arxiv.org/abs/2408.14977v1","updated":"2024-08-27T11:40:23Z","published":"2024-08-27T11:40:23Z","title":"LN-Gen: Rectal Lymph Nodes Generation via Anatomical Features","summary":" Accurate segmentation of rectal lymph nodes is crucial for the staging and\ntreatment planning of rectal cancer. However, the complexity of the surrounding\nanatomical structures and the scarcity of annotated data pose significant\nchallenges. This study introduces a novel lymph node synthesis technique aimed\nat generating diverse and realistic synthetic rectal lymph node samples to\nmitigate the reliance on manual annotation. 
Unlike direct diffusion methods,\nwhich often produce masks that are discontinuous and of suboptimal quality, our\napproach leverages an implicit SDF-based method for mask generation, ensuring\nthe production of continuous, stable, and morphologically diverse masks.\nExperimental results demonstrate that our synthetic data significantly improves\nsegmentation performance. Our work highlights the potential of diffusion model\nfor accurately synthesizing structurally complex lesions, such as lymph nodes\nin rectal cancer, alleviating the challenge of limited annotated data in this\nfield and aiding in advancements in rectal cancer diagnosis and treatment.\n","authors":["Weidong Guo","Hantao Zhang","Shouhong Wan","Bingbing Zou","Wanqin Wang","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2408.14977v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.14976v1","updated":"2024-08-27T11:38:01Z","published":"2024-08-27T11:38:01Z","title":"Prior-free Balanced Replay: Uncertainty-guided Reservoir Sampling for\n Long-Tailed Continual Learning","summary":" Even in the era of large models, one of the well-known issues in continual\nlearning (CL) is catastrophic forgetting, which is significantly challenging\nwhen the continual data stream exhibits a long-tailed distribution, termed as\nLong-Tailed Continual Learning (LTCL). Existing LTCL solutions generally\nrequire the label distribution of the data stream to achieve re-balance\ntraining. However, obtaining such prior information is often infeasible in real\nscenarios since the model should learn without pre-identifying the majority and\nminority classes. To this end, we propose a novel Prior-free Balanced Replay\n(PBR) framework to learn from long-tailed data stream with less forgetting.\nConcretely, motivated by our experimental finding that the minority classes are\nmore likely to be forgotten due to the higher uncertainty, we newly design an\nuncertainty-guided reservoir sampling strategy to prioritize rehearsing\nminority data without using any prior information, which is based on the mutual\ndependence between the model and samples. Additionally, we incorporate two\nprior-free components to further reduce the forgetting issue: (1) Boundary\nconstraint is to preserve uncertain boundary supporting samples for continually\nre-estimating task boundaries. (2) Prototype constraint is to maintain the\nconsistency of learned class prototypes along with training. Our approach is\nevaluated on three standard long-tailed benchmarks, demonstrating superior\nperformance to existing CL methods and previous SOTA LTCL approach in both\ntask- and class-incremental learning settings, as well as ordered- and\nshuffled-LTCL settings.\n","authors":["Lei Liu","Li Liu","Yawen Cui"],"pdf_url":"https://arxiv.org/pdf/2408.14976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14975v1","updated":"2024-08-27T11:31:47Z","published":"2024-08-27T11:31:47Z","title":"MegActor-$Σ$: Unlocking Flexible Mixed-Modal Control in Portrait\n Animation with Diffusion Transformer","summary":" Diffusion models have demonstrated superior performance in the field of\nportrait animation. However, current approaches relied on either visual or\naudio modality to control character movements, failing to exploit the potential\nof mixed-modal control. This challenge arises from the difficulty in balancing\nthe weak control strength of audio modality and the strong control strength of\nvisual modality. 
To address this issue, we introduce MegActor-$\\Sigma$: a\nmixed-modal conditional diffusion transformer (DiT), which can flexibly inject\naudio and visual modality control signals into portrait animation.\nSpecifically, we make substantial advancements over its predecessor, MegActor,\nby leveraging the promising model structure of DiT and integrating audio and\nvisual conditions through advanced modules within the DiT framework. To further\nachieve flexible combinations of mixed-modal control signals, we propose a\n``Modality Decoupling Control\" training strategy to balance the control\nstrength between visual and audio modalities, along with the ``Amplitude\nAdjustment\" inference strategy to freely regulate the motion amplitude of each\nmodality. Finally, to facilitate extensive studies in this field, we design\nseveral dataset evaluation metrics to filter out public datasets and solely use\nthis filtered dataset to train MegActor-$\\Sigma$. Extensive experiments\ndemonstrate the superiority of our approach in generating vivid portrait\nanimations, outperforming previous methods trained on private dataset.\n","authors":["Shurong Yang","Huadong Li","Juhao Wu","Minhao Jing","Linze Li","Renhe Ji","Jiajun Liang","Haoqiang Fan","Jin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14962v1","updated":"2024-08-27T11:09:34Z","published":"2024-08-27T11:09:34Z","title":"Deep Learning-based Average Shear Wave Velocity Prediction using\n Accelerometer Records","summary":" Assessing seismic hazards and thereby designing earthquake-resilient\nstructures or evaluating structural damage that has been incurred after an\nearthquake are important objectives in earthquake engineering. Both tasks\nrequire critical evaluation of strong ground motion records, and the knowledge\nof site conditions at the earthquake stations plays a major role in achieving\nthe aforementioned objectives. Site conditions are generally represented by the\ntime-averaged shear wave velocity in the upper 30 meters of the geological\nmaterials (Vs30). Several strong motion stations lack Vs30 measurements\nresulting in potentially inaccurate assessment of seismic hazards and\nevaluation of ground motion records. In this study, we present a deep\nlearning-based approach for predicting Vs30 at strong motion station locations\nusing three-channel earthquake records. For this purpose, Convolutional Neural\nNetworks (CNNs) with dilated and causal convolutional layers are used to\nextract deep features from accelerometer records collected from over 700\nstations located in Turkey. In order to overcome the limited availability of\nlabeled data, we propose a two-phase training approach. In the first phase, a\nCNN is trained to estimate the epicenters, for which ground truth is available\nfor all records. After the CNN is trained, the pre-trained encoder is\nfine-tuned based on the Vs30 ground truth. The performance of the proposed\nmethod is compared with machine learning models that utilize hand-crafted\nfeatures. 
The results demonstrate that the deep convolutional encoder based\nVs30 prediction model outperforms the machine learning models that rely on\nhand-crafted features.\n","authors":["Barış Yılmaz","Melek Türkmen","Sanem Meral","Erdem Akagündüz","Salih Tileylioglu"],"pdf_url":"https://arxiv.org/pdf/2408.14962v1.pdf","comment":"12 pages, 14 figures, Accepted by 18th World Conference on Earthquake\n Engineering WCEE2024"},{"id":"http://arxiv.org/abs/2408.14961v1","updated":"2024-08-27T11:07:19Z","published":"2024-08-27T11:07:19Z","title":"CVPT: Cross-Attention help Visual Prompt Tuning adapt visual task","summary":" In recent years, the rapid expansion of model sizes has led to large-scale\npre-trained models demonstrating remarkable capabilities. Consequently, there\nhas been a trend towards increasing the scale of models. However, this trend\nintroduces significant challenges, including substantial computational costs of\ntraining and transfer to downstream tasks. To address these issues,\nParameter-Efficient Fine-Tuning (PEFT) methods have been introduced. These\nmethods optimize large-scale pre-trained models for specific tasks by\nfine-tuning a select group of parameters. Among these PEFT methods,\nadapter-based and prompt-based methods are the primary techniques.\nSpecifically, in the field of visual fine-tuning, adapters gain prominence over\nprompts because of the latter's relatively weaker performance and efficiency.\nUnder the circumstances, we refine the widely-used Visual Prompt Tuning (VPT)\nmethod, proposing Cross Visual Prompt Tuning (CVPT). CVPT calculates\ncross-attention between the prompt tokens and the embedded tokens, which allows\nus to compute the semantic relationship between them and conduct the\nfine-tuning of models exactly to adapt visual tasks better. Furthermore, we\nintroduce the weight-sharing mechanism to initialize the parameters of\ncross-attention, which avoids massive learnable parameters from cross-attention\nand enhances the representative capability of cross-attention. We conduct\ncomprehensive testing across 25 datasets and the result indicates that CVPT\nsignificantly improves VPT's performance and efficiency in visual tasks. For\nexample, on the VTAB-1K benchmark, CVPT outperforms VPT over 4% in average\naccuracy, rivaling the advanced adapter-based methods in performance and\nefficiency. Our experiments confirm that prompt-based methods can achieve\nexceptional results in visual fine-tuning.\n","authors":["Lingyun Huang","Jianxu Mao","Yaonan Wang","Junfei Yi","Ziming Tao"],"pdf_url":"https://arxiv.org/pdf/2408.14961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14957v1","updated":"2024-08-27T11:04:53Z","published":"2024-08-27T11:04:53Z","title":"Applying ViT in Generalized Few-shot Semantic Segmentation","summary":" This paper explores the capability of ViT-based models under the generalized\nfew-shot semantic segmentation (GFSS) framework. We conduct experiments with\nvarious combinations of backbone models, including ResNets and pretrained\nVision Transformer (ViT)-based models, along with decoders featuring a linear\nclassifier, UPerNet, and Mask Transformer. The structure made of DINOv2 and\nlinear classifier takes the lead on popular few-shot segmentation bench mark\nPASCAL-$5^i$, substantially outperforming the best of ResNet structure by 116%\nin one-shot scenario. We demonstrate the great potential of large pretrained\nViT-based model on GFSS task, and expect further improvement on testing\nbenchmarks. 
However, a potential caveat is that when applying pure ViT-based\nmodel and large scale ViT decoder, the model is easy to overfit.\n","authors":["Liyuan Geng","Jinhong Xia","Yuanhe Guo"],"pdf_url":"https://arxiv.org/pdf/2408.14957v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.17640v2","updated":"2024-08-27T11:00:47Z","published":"2024-06-25T15:24:06Z","title":"BayTTA: Uncertainty-aware medical image classification with optimized\n test-time augmentation using Bayesian model averaging","summary":" Test-time augmentation (TTA) is a well-known technique employed during the\ntesting phase of computer vision tasks. It involves aggregating multiple\naugmented versions of input data. Combining predictions using a simple average\nformulation is a common and straightforward approach after performing TTA. This\npaper introduces a novel framework for optimizing TTA, called BayTTA\n(Bayesian-based TTA), which is based on Bayesian Model Averaging (BMA). First,\nwe generate a prediction list associated with different variations of the input\ndata created through TTA. Then, we use BMA to combine predictions weighted by\nthe respective posterior probabilities. Such an approach allows one to take\ninto account model uncertainty, and thus to enhance the predictive performance\nof the related machine learning or deep learning model. We evaluate the\nperformance of BayTTA on various public data, including three medical image\ndatasets comprising skin cancer, breast cancer, and chest X-ray images and two\nwell-known gene editing datasets, CRISPOR and GUIDE-seq. Our experimental\nresults indicate that BayTTA can be effectively integrated into\nstate-of-the-art deep learning models used in medical image analysis as well as\ninto some popular pre-trained CNN models such as VGG-16, MobileNetV2,\nDenseNet201, ResNet152V2, and InceptionRes-NetV2, leading to the enhancement in\ntheir accuracy and robustness performance. The source code of the proposed\nBayTTA method is freely available at: \\underline\n{https://github.com/Z-Sherkat/BayTTA}.\n","authors":["Zeinab Sherkatghanad","Moloud Abdar","Mohammadreza Bakhtyari","Pawel Plawiak","Vladimir Makarenkov"],"pdf_url":"https://arxiv.org/pdf/2406.17640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14950v1","updated":"2024-08-27T10:54:37Z","published":"2024-08-27T10:54:37Z","title":"NeuralOOD: Improving Out-of-Distribution Generalization Performance with\n Brain-machine Fusion Learning Framework","summary":" Deep Neural Networks (DNNs) have demonstrated exceptional recognition\ncapabilities in traditional computer vision (CV) tasks. However, existing CV\nmodels often suffer a significant decrease in accuracy when confronted with\nout-of-distribution (OOD) data. In contrast to these DNN models, human can\nmaintain a consistently low error rate when facing OOD scenes, partly\nattributed to the rich prior cognitive knowledge stored in the human brain.\nPrevious OOD generalization researches only focus on the single modal,\noverlooking the advantages of multimodal learning method. In this paper, we\nutilize the multimodal learning method to improve the OOD generalization and\npropose a novel Brain-machine Fusion Learning (BMFL) framework. We adopt the\ncross-attention mechanism to fuse the visual knowledge from CV model and prior\ncognitive knowledge from the human brain. 
Specially, we employ a pre-trained\nvisual neural encoding model to predict the functional Magnetic Resonance\nImaging (fMRI) from visual features which eliminates the need for the fMRI data\ncollection and pre-processing, effectively reduces the workload associated with\nconventional BMFL methods. Furthermore, we construct a brain transformer to\nfacilitate the extraction of knowledge inside the fMRI data. Moreover, we\nintroduce the Pearson correlation coefficient maximization regularization\nmethod into the training process, which improves the fusion capability with\nbetter constrains. Our model outperforms the DINOv2 and baseline models on the\nImageNet-1k validation dataset as well as six curated OOD datasets, showcasing\nits superior performance in diverse scenarios.\n","authors":["Shuangchen Zhao","Changde Du","Hui Li","Huiguang He"],"pdf_url":"https://arxiv.org/pdf/2408.14950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19513v2","updated":"2024-08-27T10:50:13Z","published":"2024-04-30T12:45:41Z","title":"A Smartphone-Based Method for Assessing Tomato Nutrient Status through\n Trichome Density Measurement","summary":" Early detection of fertilizer-induced stress in tomato plants is crucial for\ntimely crop management interventions and yield optimization. Conventional\noptical methods detect fertilizer stress in young leaves with difficulty. This\nstudy proposes a novel, noninvasive technique for quantifying the density of\ntrichomes-elongated hair-like structures found on plant surfaces-on young\nleaves using a smartphone. This method exhibits superior detection latency,\nenabling earlier and more accurate identification of fertilizer stress in\ntomato plants. Our approach combines augmented reality technology and image\nprocessing algorithms to analyze smartphone images of a specialized measurement\npaper. This measurement paper is applied to a tomato leaf to transfer trichomes\nonto its adhesive surface. The captured images are then processed through a\npipeline involving region of interest extraction, perspective transformation,\nand illumination correction. Trichome detection and spatial distribution\nanalysis of these preprocessed images yield a robust density metric. We\nvalidated our method through experiments on hydroponically grown tomatoes under\nvarying fertilizer concentrations. Using leave-one-out cross-validation\n(LOOCV), our model achieves a mean area under the precision-recall curve of\n0.824 and a receiver operating characteristic curve of 0.641 for predicting\nadditional fertilization needs. Based on LOOCV, quantitative analysis revealed\na strong relationship between trichome density and explanatory variables,\nincluding nitrate ion concentration, explaining 62.48% of the variation ($R^2 =\n0.625$). The predicted and actual trichome densities were strongly correlated\n($r = 0.794$). This straightforward and cost-effective method overcomes the\nlimitations of traditional techniques, demonstrating the potential of using\nsmartphones for practical plant nutrition diagnosis.\n","authors":["Sho Ueda","Xujun Ye"],"pdf_url":"https://arxiv.org/pdf/2404.19513v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14253v2","updated":"2024-08-27T10:50:13Z","published":"2024-08-26T13:16:03Z","title":"Text3DAug -- Prompted Instance Augmentation for LiDAR Perception","summary":" LiDAR data of urban scenarios poses unique challenges, such as heterogeneous\ncharacteristics and inherent class imbalance. 
Therefore, large-scale datasets\nare necessary to apply deep learning methods. Instance augmentation has emerged\nas an efficient method to increase dataset diversity. However, current methods\nrequire the time-consuming curation of 3D models or costly manual data\nannotation. To overcome these limitations, we propose Text3DAug, a novel\napproach leveraging generative models for instance augmentation. Text3DAug does\nnot depend on labeled data and is the first of its kind to generate instances\nand annotations from text. This allows for a fully automated pipeline,\neliminating the need for manual effort in practical applications. Additionally,\nText3DAug is sensor agnostic and can be applied regardless of the LiDAR sensor\nused. Comprehensive experimental analysis on LiDAR segmentation, detection and\nnovel class discovery demonstrates that Text3DAug is effective in supplementing\nexisting methods or as a standalone method, performing on par or better than\nestablished methods, however while overcoming their specific drawbacks. The\ncode is publicly available.\n","authors":["Laurenz Reichardt","Luca Uhr","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2408.14253v2.pdf","comment":"Accepted at the 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2308.13997v2","updated":"2024-08-27T10:47:47Z","published":"2023-08-27T03:54:55Z","title":"Adaptive Fusion of Radiomics and Deep Features for Lung Adenocarcinoma\n Subtype Recognition","summary":" The most common type of lung cancer, lung adenocarcinoma (LUAD), has been\nincreasingly detected since the advent of low-dose computed tomography\nscreening technology. In clinical practice, pre-invasive LUAD (Pre-IAs) should\nonly require regular follow-up care, while invasive LUAD (IAs) should receive\nimmediate treatment with appropriate lung cancer resection, based on the cancer\nsubtype. However, prior research on diagnosing LUAD has mainly focused on\nclassifying Pre-IAs/IAs, as techniques for distinguishing different subtypes of\nIAs have been lacking. In this study, we proposed a multi-head attentional\nfeature fusion (MHA-FF) model for not only distinguishing IAs from Pre-IAs, but\nalso for distinguishing the different subtypes of IAs. To predict the subtype\nof each nodule accurately, we leveraged both radiomics and deep features\nextracted from computed tomography images. Furthermore, those features were\naggregated through an adaptive fusion module that can learn attention-based\ndiscriminative features. The utility of our proposed method is demonstrated\nhere by means of real-world data collected from a multi-center cohort.\n","authors":["Jing Zhou","Xiaotong Fu","Xirong Li","Ying Ji"],"pdf_url":"https://arxiv.org/pdf/2308.13997v2.pdf","comment":"7 pages, 5 figures and 4 tables"},{"id":"http://arxiv.org/abs/2408.14947v1","updated":"2024-08-27T10:44:34Z","published":"2024-08-27T10:44:34Z","title":"ERX: A Fast Real-Time Anomaly Detection Algorithm for Hyperspectral\n Line-Scanning","summary":" Detecting unexpected objects (anomalies) in real-time has great potential for\nmonitoring, managing, and protecting the environment. Hyperspectral line-scan\ncameras are a low-cost solution that enhance confidence in anomaly detection\nover RGB and multispectral imagery. 
However, real-time algorithms for these\ncameras must be fast when using small computers (e.g., those onboard a drone or\nsmall satellite), scalable to high dimensions, adaptable to changing scenery,\nand robust against geometric and radiometric distortions. This paper introduces\nthe Exponentially moving RX algorithm (ERX) and compares it to existing\nRX-based anomaly detection methods for real-time line-scanning. ERX was tested\nusing a Jetson Xavier NX compute module, achieving the best combination of\nspeed and detection across three novel datasets compared to the other\nalgorithms. This research paves the way for future studies in grouping and\nlocating anomalous objects, adaptive and automatic threshold selection, and\nreal-time field tests. The Python code for the algorithms and experiments is\navailable at https://github.com/WiseGamgee/HyperAD.\n","authors":["Samuel Garske","Bradley Evans","Christopher Artlett","KC Wong"],"pdf_url":"https://arxiv.org/pdf/2408.14947v1.pdf","comment":"10 pages, 9 figures, 3 tables, code and datasets accessible at\n https://github.com/WiseGamgee/HyperAD"},{"id":"http://arxiv.org/abs/2408.14941v1","updated":"2024-08-27T10:26:05Z","published":"2024-08-27T10:26:05Z","title":"BOX3D: Lightweight Camera-LiDAR Fusion for 3D Object Detection and\n Localization","summary":" Object detection and global localization play a crucial role in robotics,\nspanning across a great spectrum of applications from autonomous cars to\nmulti-layered 3D Scene Graphs for semantic scene understanding. This article\nproposes BOX3D, a novel multi-modal and lightweight scheme for localizing\nobjects of interest by fusing the information from RGB camera and 3D LiDAR.\nBOX3D is structured around a three-layered architecture, building up from the\nlocal perception of the incoming sequential sensor data to the global\nperception refinement that covers for outliers and the general consistency of\neach object's observation. More specifically, the first layer handles the\nlow-level fusion of camera and LiDAR data for initial 3D bounding box\nextraction. The second layer converts each LiDAR's scan 3D bounding boxes to\nthe world coordinate frame and applies a spatial pairing and merging mechanism\nto maintain the uniqueness of objects observed from different viewpoints.\nFinally, BOX3D integrates the third layer that supervises the consistency of\nthe results on the global map iteratively, using a point-to-voxel comparison\nfor identifying all points in the global map that belong to the object.\nBenchmarking results of the proposed novel architecture are showcased in\nmultiple experimental trials on public state-of-the-art large-scale dataset of\nurban environments.\n","authors":["Mario A. V. 
Saucedo","Nikolaos Stathoulopoulos","Vidya Sumathy","Christoforos Kanellakis","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2408.14941v1.pdf","comment":"Presented in MED 2024"},{"id":"http://arxiv.org/abs/2408.14930v1","updated":"2024-08-27T10:09:17Z","published":"2024-08-27T10:09:17Z","title":"Cross-Modal Temporal Alignment for Event-guided Video Deblurring","summary":" Video deblurring aims to enhance the quality of restored results in\nmotion-blurred videos by effectively gathering information from adjacent video\nframes to compensate for the insufficient data in a single blurred frame.\nHowever, when faced with consecutively severe motion blur situations,\nframe-based video deblurring methods often fail to find accurate temporal\ncorrespondence among neighboring video frames, leading to diminished\nperformance. To address this limitation, we aim to solve the video deblurring\ntask by leveraging an event camera with micro-second temporal resolution. To\nfully exploit the dense temporal resolution of the event camera, we propose two\nmodules: 1) Intra-frame feature enhancement operates within the exposure time\nof a single blurred frame, iteratively enhancing cross-modality features in a\nrecurrent manner to better utilize the rich temporal information of events, 2)\nInter-frame temporal feature alignment gathers valuable long-range temporal\ninformation to target frames, aggregating sharp features leveraging the\nadvantages of the events. In addition, we present a novel dataset composed of\nreal-world blurred RGB videos, corresponding sharp videos, and event data. This\ndataset serves as a valuable resource for evaluating event-guided deblurring\nmethods. We demonstrate that our proposed methods outperform state-of-the-art\nframe-based and event-based motion deblurring methods through extensive\nexperiments conducted on both synthetic and real-world deblurring datasets. The\ncode and dataset are available at https://github.com/intelpro/CMTA.\n","authors":["Taewoo Kim","Hoonhee Cho","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2408.14930v1.pdf","comment":"Accepted in ECCV2024"},{"id":"http://arxiv.org/abs/2408.14927v1","updated":"2024-08-27T10:01:58Z","published":"2024-08-27T10:01:58Z","title":"Automatic Detection of COVID-19 from Chest X-ray Images Using Deep\n Learning Model","summary":" The infectious disease caused by novel corona virus (2019-nCoV) has been\nwidely spreading since last year and has shaken the entire world. It has caused\nan unprecedented effect on daily life, global economy and public health. Hence\nthis disease detection has life-saving importance for both patients as well as\ndoctors. Due to limited test kits, it is also a daunting task to test every\npatient with severe respiratory problems using conventional techniques\n(RT-PCR). Thus implementing an automatic diagnosis system is urgently required\nto overcome the scarcity problem of Covid-19 test kits at hospital, health care\nsystems. The diagnostic approach is mainly classified into two\ncategories-laboratory based and Chest radiography approach. In this paper, a\nnovel approach for computerized corona virus (2019-nCoV) detection from lung\nx-ray images is presented. Here, we propose models using deep learning to show\nthe effectiveness of diagnostic systems. 
In the experimental result, we\nevaluate proposed models on publicly available data-set which exhibit\nsatisfactory performance and promising results compared with other previous\nexisting methods.\n","authors":["Alloy Das","Rohit Agarwal","Rituparna Singh","Arindam Chowdhury","Debashis Nandi"],"pdf_url":"https://arxiv.org/pdf/2408.14927v1.pdf","comment":"Accepted in AIP Conference Proceedings (Vol. 2424, No. 1)"},{"id":"http://arxiv.org/abs/2311.12084v2","updated":"2024-08-27T09:55:37Z","published":"2023-11-20T11:08:06Z","title":"ODDR: Outlier Detection & Dimension Reduction Based Defense Against\n Adversarial Patches","summary":" Adversarial attacks present a significant challenge to the dependable\ndeployment of machine learning models, with patch-based attacks being\nparticularly potent. These attacks introduce adversarial perturbations in\nlocalized regions of an image, deceiving even well-trained models. In this\npaper, we propose Outlier Detection and Dimension Reduction (ODDR), a\ncomprehensive defense strategy engineered to counteract patch-based adversarial\nattacks through advanced statistical methodologies. Our approach is based on\nthe observation that input features corresponding to adversarial\npatches-whether naturalistic or synthetic-deviate from the intrinsic\ndistribution of the remaining image data and can thus be identified as\noutliers. ODDR operates through a robust three-stage pipeline: Fragmentation,\nSegregation, and Neutralization. This model-agnostic framework is versatile,\noffering protection across various tasks, including image classification,\nobject detection, and depth estimation, and is proved effective in both\nCNN-based and Transformer-based architectures. In the Fragmentation stage,\nimage samples are divided into smaller segments, preparing them for the\nSegregation stage, where advanced outlier detection techniques isolate\nanomalous features linked to adversarial perturbations. The Neutralization\nstage then applies dimension reduction techniques to these outliers,\neffectively neutralizing the adversarial impact while preserving critical\ninformation for the machine learning task. Extensive evaluation on benchmark\ndatasets against state-of-the-art adversarial patches underscores the efficacy\nof ODDR. Our method enhances model accuracy from 39.26% to 79.1% under the\nGoogleAp attack, outperforming leading defenses such as LGS (53.86%), Jujutsu\n(60%), and Jedi (64.34%).\n","authors":["Nandish Chattopadhyay","Amira Guesmi","Muhammad Abdullah Hanif","Bassem Ouni","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2311.12084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14916v1","updated":"2024-08-27T09:44:54Z","published":"2024-08-27T09:44:54Z","title":"Towards Real-world Event-guided Low-light Video Enhancement and\n Deblurring","summary":" In low-light conditions, capturing videos with frame-based cameras often\nrequires long exposure times, resulting in motion blur and reduced visibility.\nWhile frame-based motion deblurring and low-light enhancement have been\nstudied, they still pose significant challenges. Event cameras have emerged as\na promising solution for improving image quality in low-light environments and\naddressing motion blur. They provide two key advantages: capturing scene\ndetails well even in low light due to their high dynamic range, and effectively\ncapturing motion information during long exposures due to their high temporal\nresolution. 
Despite efforts to tackle low-light enhancement and motion\ndeblurring using event cameras separately, previous work has not addressed both\nsimultaneously. To explore the joint task, we first establish real-world\ndatasets for event-guided low-light enhancement and deblurring using a hybrid\ncamera system based on beam splitters. Subsequently, we introduce an end-to-end\nframework to effectively handle these tasks. Our framework incorporates a\nmodule to efficiently leverage temporal information from events and frames.\nFurthermore, we propose a module to utilize cross-modal feature information to\nemploy a low-pass filter for noise suppression while enhancing the main\nstructural information. Our proposed method significantly outperforms existing\napproaches in addressing the joint task. Our project pages are available at\nhttps://github.com/intelpro/ELEDNet.\n","authors":["Taewoo Kim","Jaeseok Jeong","Hoonhee Cho","Yuhwan Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2408.14916v1.pdf","comment":"Accepted in ECCV2024"},{"id":"http://arxiv.org/abs/2408.14899v1","updated":"2024-08-27T09:23:18Z","published":"2024-08-27T09:23:18Z","title":"MeshUp: Multi-Target Mesh Deformation via Blended Score Distillation","summary":" We propose MeshUp, a technique that deforms a 3D mesh towards multiple target\nconcepts, and intuitively controls the region where each concept is expressed.\nConveniently, the concepts can be defined as either text queries, e.g., \"a dog\"\nand \"a turtle,\" or inspirational images, and the local regions can be selected\nas any number of vertices on the mesh. We can effectively control the influence\nof the concepts and mix them together using a novel score distillation\napproach, referred to as the Blended Score Distillation (BSD). BSD operates on\neach attention layer of the denoising U-Net of a diffusion model as it extracts\nand injects the per-objective activations into a unified denoising pipeline\nfrom which the deformation gradients are calculated. To localize the expression\nof these activations, we create a probabilistic Region of Interest (ROI) map on\nthe surface of the mesh, and turn it into 3D-consistent masks that we use to\ncontrol the expression of these activations. We demonstrate the effectiveness\nof BSD empirically and show that it can deform various meshes towards multiple\nobjectives.\n","authors":["Hyunwoo Kim","Itai Lang","Noam Aigerman","Thibault Groueix","Vladimir G. Kim","Rana Hanocka"],"pdf_url":"https://arxiv.org/pdf/2408.14899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14895v1","updated":"2024-08-27T09:18:57Z","published":"2024-08-27T09:18:57Z","title":"VHAKG: A Multi-modal Knowledge Graph Based on Synchronized Multi-view\n Videos of Daily Activities","summary":" Multi-modal knowledge graphs (MMKGs), which ground various non-symbolic data\n(e.g., images and videos) into symbols, have attracted attention as resources\nenabling knowledge processing and machine learning across modalities. However,\nthe construction of MMKGs for videos consisting of multiple events, such as\ndaily activities, is still in the early stages. In this paper, we construct an\nMMKG based on synchronized multi-view simulated videos of daily activities.\nBesides representing the content of daily life videos as event-centric\nknowledge, our MMKG also includes frame-by-frame fine-grained changes, such as\nbounding boxes within video frames. In addition, we provide support tools for\nquerying our MMKG. 
As an application example, we demonstrate that our MMKG\nfacilitates benchmarking vision-language models by providing the necessary\nvision-language datasets for a tailored task.\n","authors":["Shusaku Egami","Takahiro Ugai","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2408.14895v1.pdf","comment":"5 pages,4 figures, accepted by CIKM2024 Resource Track"},{"id":"http://arxiv.org/abs/2408.00591v2","updated":"2024-08-27T09:09:18Z","published":"2024-08-01T14:20:47Z","title":"Regional quality estimation for echocardiography using deep learning","summary":" Automatic estimation of cardiac ultrasound image quality can be beneficial\nfor guiding operators and ensuring the accuracy of clinical measurements.\nPrevious work often fails to distinguish the view correctness of the\nechocardiogram from the image quality. Additionally, previous studies only\nprovide a global image quality value, which limits their practical utility. In\nthis work, we developed and compared three methods to estimate image quality:\n1) classic pixel-based metrics like the generalized contrast-to-noise ratio\n(gCNR) on myocardial segments as region of interest and left ventricle lumen as\nbackground, obtained using a U-Net segmentation 2) local image coherence\nderived from a U-Net model that predicts coherence from B-Mode images 3) a deep\nconvolutional network that predicts the quality of each region directly in an\nend-to-end fashion. We evaluate each method against manual regional image\nquality annotations by three experienced cardiologists. The results indicate\npoor performance of the gCNR metric, with Spearman correlation to the\nannotations of rho = 0.24. The end-to-end learning model obtains the best\nresult, rho = 0.69, comparable to the inter-observer correlation, rho = 0.63.\nFinally, the coherence-based method, with rho = 0.58, outperformed the\nclassical metrics and is more generic than the end-to-end approach.\n","authors":["Gilles Van De Vyver","Svein-Erik Måsøy","Håvard Dalen","Bjørnar Leangen Grenne","Espen Holte","Sindre Hellum Olaisen","John Nyberg","Andreas Østvik","Lasse Løvstakken","Erik Smistad"],"pdf_url":"https://arxiv.org/pdf/2408.00591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14879v1","updated":"2024-08-27T08:48:21Z","published":"2024-08-27T08:48:21Z","title":"Adversarial Manhole: Challenging Monocular Depth Estimation and Semantic\n Segmentation Models with Patch Attack","summary":" Monocular depth estimation (MDE) and semantic segmentation (SS) are crucial\nfor the navigation and environmental interpretation of many autonomous driving\nsystems. However, their vulnerability to practical adversarial attacks is a\nsignificant concern. This paper presents a novel adversarial attack using\npractical patches that mimic manhole covers to deceive MDE and SS models. The\ngoal is to cause these systems to misinterpret scenes, leading to false\ndetections of near obstacles or non-passable objects. We use Depth Planar\nMapping to precisely position these patches on road surfaces, enhancing the\nattack's effectiveness. Our experiments show that these adversarial patches\ncause a 43% relative error in MDE and achieve a 96% attack success rate in SS.\nThese patches create affected error regions over twice their size in MDE and\napproximately equal to their size in SS. 
Our studies also confirm the patch's\neffectiveness in physical simulations, the adaptability of the patches across\ndifferent target models, and the effectiveness of our proposed modules,\nhighlighting their practical implications.\n","authors":["Naufal Suryanto","Andro Aprila Adiputra","Ahmada Yusril Kadiptya","Yongsu Kim","Howon Kim"],"pdf_url":"https://arxiv.org/pdf/2408.14879v1.pdf","comment":"Accepted for WISA 2024. Code and dataset:\n https://github.com/naufalso/adversarial-manhole"},{"id":"http://arxiv.org/abs/2408.14868v1","updated":"2024-08-27T08:39:47Z","published":"2024-08-27T08:39:47Z","title":"ZeroMamba: Exploring Visual State Space Model for Zero-Shot Learning","summary":" Zero-shot learning (ZSL) aims to recognize unseen classes by transferring\nsemantic knowledge from seen classes to unseen ones, guided by semantic\ninformation. To this end, existing works have demonstrated remarkable\nperformance by utilizing global visual features from Convolutional Neural\nNetworks (CNNs) or Vision Transformers (ViTs) for visual-semantic interactions.\nDue to the limited receptive fields of CNNs and the quadratic complexity of\nViTs, however, these visual backbones achieve suboptimal visual-semantic\ninteractions. In this paper, motivated by the visual state space model (i.e.,\nVision Mamba), which is capable of capturing long-range dependencies and\nmodeling complex visual dynamics, we propose a parameter-efficient ZSL\nframework called ZeroMamba to advance ZSL. Our ZeroMamba comprises three key\ncomponents: Semantic-aware Local Projection (SLP), Global Representation\nLearning (GRL), and Semantic Fusion (SeF). Specifically, SLP integrates\nsemantic embeddings to map visual features to local semantic-related\nrepresentations, while GRL encourages the model to learn global semantic\nrepresentations. SeF combines these two semantic representations to enhance the\ndiscriminability of semantic features. We incorporate these designs into Vision\nMamba, forming an end-to-end ZSL framework. As a result, the learned semantic\nrepresentations are better suited for classification. Through extensive\nexperiments on four prominent ZSL benchmarks, ZeroMamba demonstrates superior\nperformance, significantly outperforming the state-of-the-art (i.e., CNN-based\nand ViT-based) methods under both conventional ZSL (CZSL) and generalized ZSL\n(GZSL) settings. Code is available at:\nhttps://anonymous.4open.science/r/ZeroMamba.\n","authors":["Wenjin Hou","Dingjie Fu","Kun Li","Shiming Chen","Hehe Fan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2408.14868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00737v2","updated":"2024-08-27T08:33:31Z","published":"2024-06-30T15:50:32Z","title":"LLM4GEN: Leveraging Semantic Representation of LLMs for Text-to-Image\n Generation","summary":" Diffusion models have exhibited substantial success in text-to-image\ngeneration. However, they often encounter challenges when dealing with complex\nand dense prompts involving multiple objects, attribute binding, and long\ndescriptions. In this paper, we propose a novel framework called\n\\textbf{LLM4GEN}, which enhances the semantic understanding of text-to-image\ndiffusion models by leveraging the representation of Large Language Models\n(LLMs). It can be seamlessly incorporated into various diffusion models as a\nplug-and-play component. A specially designed Cross-Adapter Module (CAM)\nintegrates the original text features of text-to-image models with LLM\nfeatures, thereby enhancing text-to-image generation. 
Additionally, to\nfacilitate and correct entity-attribute relationships in text prompts, we\ndevelop an entity-guided regularization loss to further improve generation\nperformance. We also introduce DensePrompts, which contains $7,000$ dense\nprompts to provide a comprehensive evaluation for the text-to-image generation\ntask. Experiments indicate that LLM4GEN significantly improves the semantic\nalignment of SD1.5 and SDXL, demonstrating increases of 9.69\\% and 12.90\\% in\ncolor on T2I-CompBench, respectively. Moreover, it surpasses existing models in\nterms of sample quality, image-text alignment, and human evaluation.\n","authors":["Mushui Liu","Yuhang Ma","Yang Zhen","Jun Dan","Yunlong Yu","Zeng Zhao","Zhipeng Hu","Bai Liu","Changjie Fan"],"pdf_url":"https://arxiv.org/pdf/2407.00737v2.pdf","comment":"11 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.14860v1","updated":"2024-08-27T08:28:01Z","published":"2024-08-27T08:28:01Z","title":"DiffSurf: A Transformer-based Diffusion Model for Generating and\n Reconstructing 3D Surfaces in Pose","summary":" This paper presents DiffSurf, a transformer-based denoising diffusion model\nfor generating and reconstructing 3D surfaces. Specifically, we design a\ndiffusion transformer architecture that predicts noise from noisy 3D surface\nvertices and normals. With this architecture, DiffSurf is able to generate 3D\nsurfaces in various poses and shapes, such as human bodies, hands, animals and\nman-made objects. Further, DiffSurf is versatile in that it can address various\n3D downstream tasks including morphing, body shape variation and 3D human mesh\nfitting to 2D keypoints. Experimental results on 3D human model benchmarks\ndemonstrate that DiffSurf can generate shapes with greater diversity and higher\nquality than previous generative models. Furthermore, when applied to the task\nof single-image 3D human mesh recovery, DiffSurf achieves accuracy comparable\nto prior techniques at a near real-time rate.\n","authors":["Yusuke Yoshiyasu","Leyuan Sun"],"pdf_url":"https://arxiv.org/pdf/2408.14860v1.pdf","comment":"Accepted at ECCV2024"},{"id":"http://arxiv.org/abs/2405.18911v2","updated":"2024-08-27T08:22:54Z","published":"2024-05-29T09:13:30Z","title":"Exploring Human-in-the-Loop Test-Time Adaptation by Synergizing Active\n Learning and Model Selection","summary":" Existing test-time adaptation (TTA) approaches often adapt models with the\nunlabeled testing data stream. A recent attempt relaxed the assumption by\nintroducing limited human annotation, referred to as Human-In-the-Loop\nTest-Time Adaptation (HILTTA) in this study. The focus of existing HILTTA\nstudies lies in selecting the most informative samples to label, a.k.a. active\nlearning. In this work, we are motivated by a pitfall of TTA, i.e. sensitivity\nto hyper-parameters, and propose to approach HILTTA by synergizing active\nlearning and model selection. Specifically, we first select samples for human\nannotation (active learning) and then use the labeled data to select optimal\nhyper-parameters (model selection). To prevent the model selection process from\noverfitting to local distributions, multiple regularization techniques are\nemployed to complement the validation objective. A sample selection strategy is\nfurther tailored by considering the balance between active learning and model\nselection purposes. 
We demonstrate on 5 TTA datasets that the proposed HILTTA\napproach is compatible with off-the-shelf TTA methods and such combinations\nsubstantially outperform the state-of-the-art HILTTA methods. Importantly, our\nproposed method can always prevent choosing the worst hyper-parameters on all\noff-the-shelf TTA methods. The source code will be released upon publication.\n","authors":["Yushu Li","Yongyi Su","Xulei Yang","Kui Jia","Xun Xu"],"pdf_url":"https://arxiv.org/pdf/2405.18911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09126v2","updated":"2024-08-27T08:19:22Z","published":"2024-08-17T07:27:14Z","title":"Barbie: Text to Barbie-Style 3D Avatars","summary":" Recent advances in text-guided 3D avatar generation have made substantial\nprogress by distilling knowledge from diffusion models. Despite the plausible\ngenerated appearance, existing methods cannot achieve fine-grained\ndisentanglement or high-fidelity modeling between inner body and outfit. In\nthis paper, we propose Barbie, a novel framework for generating 3D avatars that\ncan be dressed in diverse and high-quality Barbie-like garments and\naccessories. Instead of relying on a holistic model, Barbie achieves\nfine-grained disentanglement on avatars by semantic-aligned separated models\nfor human body and outfits. These disentangled 3D representations are then\noptimized by different expert models to guarantee the domain-specific fidelity.\nTo balance geometry diversity and reasonableness, we propose a series of losses\nfor template-preserving and human-prior evolving. The final avatar is enhanced\nby unified texture refinement for superior texture consistency. Extensive\nexperiments demonstrate that Barbie outperforms existing methods in both\ndressed human and outfit generation, supporting flexible apparel combination\nand animation. The code will be released for research purposes. Our project\npage is: https://xiaokunsun.github.io/Barbie.github.io/.\n","authors":["Xiaokun Sun","Zhenyu Zhang","Ying Tai","Qian Wang","Hao Tang","Zili Yi","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09126v2.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.21705v2","updated":"2024-08-27T08:14:16Z","published":"2024-07-31T15:53:20Z","title":"Tora: Trajectory-oriented Diffusion Transformer for Video Generation","summary":" Recent advancements in Diffusion Transformer (DiT) have demonstrated\nremarkable proficiency in producing high-quality video content. Nonetheless,\nthe potential of transformer-based diffusion models for effectively generating\nvideos with controllable motion remains an area of limited exploration. This\npaper introduces Tora, the first trajectory-oriented DiT framework that\nconcurrently integrates textual, visual, and trajectory conditions, thereby\nenabling scalable video generation with effective motion guidance.\nSpecifically, Tora consists of a Trajectory Extractor(TE), a Spatial-Temporal\nDiT, and a Motion-guidance Fuser(MGF). The TE encodes arbitrary trajectories\ninto hierarchical spacetime motion patches with a 3D video compression network.\nThe MGF integrates the motion patches into the DiT blocks to generate\nconsistent videos that accurately follow designated trajectories. 
Our design\naligns seamlessly with DiT's scalability, allowing precise control of video\ncontent's dynamics with diverse durations, aspect ratios, and resolutions.\nExtensive experiments demonstrate Tora's excellence in achieving high motion\nfidelity, while also meticulously simulating the intricate movement of the\nphysical world.\n","authors":["Zhenghao Zhang","Junchao Liao","Menghao Li","Zuozhuo Dai","Bingxue Qiu","Siyu Zhu","Long Qin","Weizhi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16580v4","updated":"2024-08-27T08:13:01Z","published":"2023-05-26T02:09:48Z","title":"TFDet: Target-Aware Fusion for RGB-T Pedestrian Detection","summary":" Pedestrian detection plays a critical role in computer vision as it\ncontributes to ensuring traffic safety. Existing methods that rely solely on\nRGB images suffer from performance degradation under low-light conditions due\nto the lack of useful information. To address this issue, recent multispectral\ndetection approaches have combined thermal images to provide complementary\ninformation and have obtained enhanced performances. Nevertheless, few\napproaches focus on the negative effects of false positives caused by noisy\nfused feature maps. Different from them, we comprehensively analyze the impacts\nof false positives on the detection performance and find that enhancing feature\ncontrast can significantly reduce these false positives. In this paper, we\npropose a novel target-aware fusion strategy for multispectral pedestrian\ndetection, named TFDet. TFDet achieves state-of-the-art performance on two\nmultispectral pedestrian benchmarks, KAIST and LLVIP. TFDet can easily extend\nto multi-class object detection scenarios. It outperforms the previous best\napproaches on two multispectral object detection benchmarks, FLIR and M3FD.\nImportantly, TFDet has comparable inference efficiency to the previous\napproaches, and has remarkably good detection performance even under low-light\nconditions, which is a significant advancement for ensuring road safety.\n","authors":["Xue Zhang","Xiaohan Zhang","Jiangtao Wang","Jiacheng Ying","Zehua Sheng","Heng Yu","Chunguang Li","Hui-Liang Shen"],"pdf_url":"https://arxiv.org/pdf/2305.16580v4.pdf","comment":"This paper has been accepted by IEEE T-NNLS journal. Please jump to\n External DOI to view the official version"},{"id":"http://arxiv.org/abs/2408.13766v2","updated":"2024-08-27T08:07:20Z","published":"2024-08-25T08:23:06Z","title":"Enhancing Robustness of Human Detection Algorithms in Maritime SAR\n through Augmented Aerial Images to Simulate Weather Conditions","summary":" 7,651 cases of Search and Rescue Missions (SAR) were reported by the United\nStates Coast Guard in 2024, with over 1322 SAR helicopters deployed in the 6\nfirst months alone. Through the utilizations of YOLO, we were able to run\ndifferent weather conditions and lighting from our augmented dataset for\ntraining. YOLO then utilizes CNNs to apply a series of convolutions and pooling\nlayers to the input image, where the convolution layers are able to extract the\nmain features of the image. Through this, our YOLO model is able to learn to\ndifferentiate different objects which may considerably improve its accuracy,\npossibly enhancing the efficiency of SAR operations through enhanced detection\naccuracy. 
This paper aims to improve the model's accuracy of human detection in\nmaritime SAR by evaluating a robust datasets containing various elevations and\ngeological locations, as well as through data augmentation which simulates\ndifferent weather and lighting. We observed that models trained on augmented\ndatasets outperformed their non-augmented counterparts in which the human\nrecall scores ranged from 0.891 to 0.911 with an improvement rate of 3.4\\% on\nthe YOLOv5l model. Results showed that these models demonstrate greater\nrobustness to real-world conditions in varying of weather, brightness, tint,\nand contrast.\n","authors":["Miguel Tjia","Artem Kim","Elaine Wynette Wijaya","Hanna Tefara","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.13766v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19698v2","updated":"2024-08-27T08:06:38Z","published":"2024-07-29T04:43:58Z","title":"Classification Matters: Improving Video Action Detection with\n Class-Specific Attention","summary":" Video action detection (VAD) aims to detect actors and classify their actions\nin a video. We figure that VAD suffers more from classification rather than\nlocalization of actors. Hence, we analyze how prevailing methods form features\nfor classification and find that they prioritize actor regions, yet often\noverlooking the essential contextual information necessary for accurate\nclassification. Accordingly, we propose to reduce the bias toward actor and\nencourage paying attention to the context that is relevant to each action\nclass. By assigning a class-dedicated query to each action class, our model can\ndynamically determine where to focus for effective classification. The proposed\nmodel demonstrates superior performance on three challenging benchmarks with\nsignificantly fewer parameters and less computation.\n","authors":["Jinsung Lee","Taeoh Kim","Inwoong Lee","Minho Shim","Dongyoon Wee","Minsu Cho","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2407.19698v2.pdf","comment":"31 pages, accepted to ECCV 2024 (oral)"},{"id":"http://arxiv.org/abs/2408.14847v1","updated":"2024-08-27T07:58:08Z","published":"2024-08-27T07:58:08Z","title":"Intraoperative Glioma Segmentation with YOLO + SAM for Improved Accuracy\n in Tumor Resection","summary":" Gliomas, a common type of malignant brain tumor, present significant surgical\nchallenges due to their similarity to healthy tissue. Preoperative Magnetic\nResonance Imaging (MRI) images are often ineffective during surgery due to\nfactors such as brain shift, which alters the position of brain structures and\ntumors. This makes real-time intraoperative MRI (ioMRI) crucial, as it provides\nupdated imaging that accounts for these shifts, ensuring more accurate tumor\nlocalization and safer resections. This paper presents a deep learning pipeline\ncombining You Only Look Once Version 8 (YOLOv8) and Segment Anything Model\nVision Transformer-base (SAM ViT-b) to enhance glioma detection and\nsegmentation during ioMRI. Our model was trained using the Brain Tumor\nSegmentation 2021 (BraTS 2021) dataset, which includes standard magnetic\nresonance imaging (MRI) images, and noise-augmented MRI images that simulate\nioMRI images. Noised MRI images are harder for a deep learning pipeline to\nsegment, but they are more representative of surgical conditions. Achieving a\nDice Similarity Coefficient (DICE) score of 0.79, our model performs comparably\nto state-of-the-art segmentation models tested on noiseless data. 
This\nperformance demonstrates the model's potential to assist surgeons in maximizing\ntumor resection and improving surgical outcomes.\n","authors":["Samir Kassam","Angelo Markham","Katie Vo","Yashas Revanakara","Michael Lam","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14846v1","updated":"2024-08-27T07:57:58Z","published":"2024-08-27T07:57:58Z","title":"Diffusion-Occ: 3D Point Cloud Completion via Occupancy Diffusion","summary":" Point clouds are crucial for capturing three-dimensional data but often\nsuffer from incompleteness due to limitations such as resolution and occlusion.\nTraditional methods typically rely on point-based approaches within\ndiscriminative frameworks for point cloud completion. In this paper, we\nintroduce \\textbf{Diffusion-Occ}, a novel framework for Diffusion Point Cloud\nCompletion. Diffusion-Occ utilizes a two-stage coarse-to-fine approach. In the\nfirst stage, the Coarse Density Voxel Prediction Network (CDNet) processes\npartial points to predict coarse density voxels, streamlining global feature\nextraction through voxel classification, as opposed to previous\nregression-based methods. In the second stage, we introduce the Occupancy\nGeneration Network (OccGen), a conditional occupancy diffusion model based on a\ntransformer architecture and enhanced by our Point-Voxel Fuse (PVF) block. This\nblock integrates coarse density voxels with partial points to leverage both\nglobal and local features for comprehensive completion. By thresholding the\noccupancy field, we convert it into a complete point cloud. Additionally, our\nmethod employs diverse training mixtures and efficient diffusion\nparameterization to enable effective one-step sampling during both training and\ninference. Experimental results demonstrate that Diffusion-Occ outperforms\nexisting discriminative and generative methods.\n","authors":["Guoqing Zhang","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14842v1","updated":"2024-08-27T07:54:01Z","published":"2024-08-27T07:54:01Z","title":"From Bias to Balance: Detecting Facial Expression Recognition Biases in\n Large Multimodal Foundation Models","summary":" This study addresses the racial biases in facial expression recognition (FER)\nsystems within Large Multimodal Foundation Models (LMFMs). Despite advances in\ndeep learning and the availability of diverse datasets, FER systems often\nexhibit higher error rates for individuals with darker skin tones. Existing\nresearch predominantly focuses on traditional FER models (CNNs, RNNs, ViTs),\nleaving a gap in understanding racial biases in LMFMs. We benchmark four\nleading LMFMs: GPT-4o, PaliGemma, Gemini, and CLIP to assess their performance\nin facial emotion detection across different racial demographics. A linear\nclassifier trained on CLIP embeddings obtains accuracies of 95.9\\% for RADIATE,\n90.3\\% for Tarr, and 99.5\\% for Chicago Face. Furthermore, we identify that\nAnger is misclassified as Disgust 2.1 times more often in Black Females than\nWhite Females. 
This study highlights the need for fairer FER systems and\nestablishes a foundation for developing unbiased, accurate FER technologies.\nVisit https://kvjvhub.github.io/FERRacialBias/ for further information\nregarding the biases within facial expression recognition.\n","authors":["Kaylee Chhua","Zhoujinyi Wen","Vedant Hathalia","Kevin Zhu","Sean O'Brien"],"pdf_url":"https://arxiv.org/pdf/2408.14842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14841v1","updated":"2024-08-27T07:52:44Z","published":"2024-08-27T07:52:44Z","title":"Diffusion based Semantic Outlier Generation via Nuisance Awareness for\n Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection, which determines whether a given sample\nis part of the in-distribution (ID), has recently shown promising results\nthrough training with synthetic OOD datasets. Nonetheless, existing methods\noften produce outliers that are considerably distant from the ID, showing\nlimited efficacy for capturing subtle distinctions between ID and OOD. To\naddress these issues, we propose a novel framework, Semantic Outlier generation\nvia Nuisance Awareness (SONA), which notably produces challenging outliers by\ndirectly leveraging pixel-space ID samples through diffusion models. Our\napproach incorporates SONA guidance, providing separate control over semantic\nand nuisance regions of ID samples. Thereby, the generated outliers achieve two\ncrucial properties: (i) they present explicit semantic-discrepant information,\nwhile (ii) maintaining various levels of nuisance resemblance with ID.\nFurthermore, the improved OOD detector training with SONA outliers facilitates\nlearning with a focus on semantic distinctions. Extensive experiments\ndemonstrate the effectiveness of our framework, achieving an impressive AUROC\nof 88% on near-OOD datasets, which surpasses the performance of baseline\nmethods by a significant margin of approximately 6%.\n","authors":["Suhee Yoon","Sanghyu Yoon","Hankook Lee","Ye Seul Sim","Sungik Choi","Kyungeun Lee","Hye-Seung Cho","Woohyung Lim"],"pdf_url":"https://arxiv.org/pdf/2408.14841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14837v1","updated":"2024-08-27T07:46:07Z","published":"2024-08-27T07:46:07Z","title":"Diffusion Models Are Real-Time Game Engines","summary":" We present GameNGen, the first game engine powered entirely by a neural model\nthat enables real-time interaction with a complex environment over long\ntrajectories at high quality. GameNGen can interactively simulate the classic\ngame DOOM at over 20 frames per second on a single TPU. Next frame prediction\nachieves a PSNR of 29.4, comparable to lossy JPEG compression. Human raters are\nonly slightly better than random chance at distinguishing short clips of the\ngame from clips of the simulation. GameNGen is trained in two phases: (1) an\nRL-agent learns to play the game and the training sessions are recorded, and\n(2) a diffusion model is trained to produce the next frame, conditioned on the\nsequence of past frames and actions. 
Conditioning augmentations enable stable\nauto-regressive generation over long trajectories.\n","authors":["Dani Valevski","Yaniv Leviathan","Moab Arar","Shlomi Fruchter"],"pdf_url":"https://arxiv.org/pdf/2408.14837v1.pdf","comment":"Project page: https://gamengen.github.io/"},{"id":"http://arxiv.org/abs/2407.16232v2","updated":"2024-08-27T07:31:37Z","published":"2024-07-23T07:17:10Z","title":"Channel-Partitioned Windowed Attention And Frequency Learning for Single\n Image Super-Resolution","summary":" Recently, window-based attention methods have shown great potential for\ncomputer vision tasks, particularly in Single Image Super-Resolution (SISR).\nHowever, it may fall short in capturing long-range dependencies and\nrelationships between distant tokens. Additionally, we find that learning on\nspatial domain does not convey the frequency content of the image, which is a\ncrucial aspect in SISR. To tackle these issues, we propose a new\nChannel-Partitioned Attention Transformer (CPAT) to better capture long-range\ndependencies by sequentially expanding windows along the height and width of\nfeature maps. In addition, we propose a novel Spatial-Frequency Interaction\nModule (SFIM), which incorporates information from spatial and frequency\ndomains to provide a more comprehensive information from feature maps. This\nincludes information about the frequency content and enhances the receptive\nfield across the entire image. Experimental findings show the effectiveness of\nour proposed modules and architecture. In particular, CPAT surpasses current\nstate-of-the-art methods by up to 0.31dB at x2 SR on Urban100.\n","authors":["Dinh Phu Tran","Dao Duy Hung","Daeyoung Kim"],"pdf_url":"https://arxiv.org/pdf/2407.16232v2.pdf","comment":"Camera ready version, BMVC 2024"},{"id":"http://arxiv.org/abs/2408.14829v1","updated":"2024-08-27T07:26:10Z","published":"2024-08-27T07:26:10Z","title":"Time-Aware Face Anti-Spoofing with Rotation Invariant Local Binary\n Patterns and Deep Learning","summary":" Facial recognition systems have become an integral part of the modern world.\nThese methods accomplish the task of human identification in an automatic,\nfast, and non-interfering way. Past research has uncovered high vulnerability\nto simple imitation attacks that could lead to erroneous identification and\nsubsequent authentication of attackers. Similar to face recognition, imitation\nattacks can also be detected with Machine Learning. Attack detection systems\nuse a variety of facial features and advanced machine learning models for\nuncovering the presence of attacks. In this work, we assess existing work on\nliveness detection and propose a novel approach that promises high\nclassification accuracy by combining previously unused features with time-aware\ndeep learning strategies.\n","authors":["Moritz Finke","Alexandra Dmitrienko"],"pdf_url":"https://arxiv.org/pdf/2408.14829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12017v3","updated":"2024-08-27T07:23:22Z","published":"2023-08-23T09:20:05Z","title":"Distribution-Aware Calibration for Object Detection with Noisy Bounding\n Boxes","summary":" Large-scale well-annotated datasets are of great importance for training an\neffective object detector. However, obtaining accurate bounding box annotations\nis laborious and demanding. Unfortunately, the resultant noisy bounding boxes\ncould cause corrupt supervision signals and thus diminish detection\nperformance. 
Motivated by the observation that the real ground-truth is usually\nsituated in the aggregation region of the proposals assigned to a noisy\nground-truth, we propose DIStribution-aware CalibratiOn (DISCO) to model the\nspatial distribution of proposals for calibrating supervision signals. In\nDISCO, spatial distribution modeling is performed to statistically extract the\npotential locations of objects. Based on the modeled distribution, three\ndistribution-aware techniques, i.e., distribution-aware proposal augmentation\n(DA-Aug), distribution-aware box refinement (DA-Ref), and distribution-aware\nconfidence estimation (DA-Est), are developed to improve classification,\nlocalization, and interpretability, respectively. Extensive experiments on\nlarge-scale noisy image datasets (i.e., Pascal VOC and MS-COCO) demonstrate\nthat DISCO can achieve state-of-the-art detection performance, especially at\nhigh noise levels. Code is available at https://github.com/Correr-Zhou/DISCO.\n","authors":["Donghao Zhou","Jialin Li","Jinpeng Li","Jiancheng Huang","Qiang Nie","Yong Liu","Bin-Bin Gao","Qiong Wang","Pheng-Ann Heng","Guangyong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12017v3.pdf","comment":"Accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2408.11413v2","updated":"2024-08-27T07:21:02Z","published":"2024-08-21T08:19:12Z","title":"Pano2Room: Novel View Synthesis from a Single Indoor Panorama","summary":" Recent single-view 3D generative methods have made significant advancements\nby leveraging knowledge distilled from extensive 3D object datasets. However,\nchallenges persist in the synthesis of 3D scenes from a single view, primarily\ndue to the complexity of real-world environments and the limited availability\nof high-quality prior resources. In this paper, we introduce a novel approach\ncalled Pano2Room, designed to automatically reconstruct high-quality 3D indoor\nscenes from a single panoramic image. These panoramic images can be easily\ngenerated using a panoramic RGBD inpainter from captures at a single location\nwith any camera. The key idea is to initially construct a preliminary mesh from\nthe input panorama, and iteratively refine this mesh using a panoramic RGBD\ninpainter while collecting photo-realistic 3D-consistent pseudo novel views.\nFinally, the refined mesh is converted into a 3D Gaussian Splatting field and\ntrained with the collected pseudo novel views. This pipeline enables the\nreconstruction of real-world 3D scenes, even in the presence of large\nocclusions, and facilitates the synthesis of photo-realistic novel views with\ndetailed geometry. Extensive qualitative and quantitative experiments have been\nconducted to validate the superiority of our method in single-panorama indoor\nnovel synthesis compared to the state-of-the-art. 
Our code and data are\navailable at \\url{https://github.com/TrickyGo/Pano2Room}.\n","authors":["Guo Pu","Yiming Zhao","Zhouhui Lian"],"pdf_url":"https://arxiv.org/pdf/2408.11413v2.pdf","comment":"SIGGRAPH Asia 2024 Conference Papers (SA Conference Papers '24),\n December 3--6, 2024, Tokyo, Japan"},{"id":"http://arxiv.org/abs/2408.14826v1","updated":"2024-08-27T07:13:44Z","published":"2024-08-27T07:13:44Z","title":"Alfie: Democratising RGBA Image Generation With No $$$","summary":" Designs and artworks are ubiquitous across various creative fields, requiring\ngraphic design skills and dedicated software to create compositions that\ninclude many graphical elements, such as logos, icons, symbols, and art scenes,\nwhich are integral to visual storytelling. Automating the generation of such\nvisual elements improves graphic designers' productivity, democratizes and\ninnovates the creative industry, and helps generate more realistic synthetic\ndata for related tasks. These illustration elements are mostly RGBA images with\nirregular shapes and cutouts, facilitating blending and scene composition.\nHowever, most image generation models are incapable of generating such images\nand achieving this capability requires expensive computational resources,\nspecific training recipes, or post-processing solutions. In this work, we\npropose a fully-automated approach for obtaining RGBA illustrations by\nmodifying the inference-time behavior of a pre-trained Diffusion Transformer\nmodel, exploiting the prompt-guided controllability and visual quality offered\nby such models with no additional computational cost. We force the generation\nof entire subjects without sharp croppings, whose background is easily removed\nfor seamless integration into design projects or artistic scenes. We show with\na user study that, in most cases, users prefer our solution over generating and\nthen matting an image, and we show that our generated illustrations yield good\nresults when used as inputs for composite scene generation pipelines. We\nrelease the code at https://github.com/aimagelab/Alfie.\n","authors":["Fabio Quattrini","Vittorio Pippi","Silvia Cascianelli","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2408.14826v1.pdf","comment":"Accepted at ECCV AI for Visual Arts Workshop and Challenges"},{"id":"http://arxiv.org/abs/2408.13423v2","updated":"2024-08-27T07:12:52Z","published":"2024-08-24T01:33:28Z","title":"Training-free Long Video Generation with Chain of Diffusion Model\n Experts","summary":" Video generation models hold substantial potential in areas such as\nfilmmaking. However, current video diffusion models need high computational\ncosts and produce suboptimal results due to high complexity of video generation\ntask. In this paper, we propose \\textbf{ConFiner}, an efficient high-quality\nvideo generation framework that decouples video generation into easier\nsubtasks: structure \\textbf{con}trol and spatial-temporal re\\textbf{fine}ment.\nIt can generate high-quality videos with chain of off-the-shelf diffusion model\nexperts, each expert responsible for a decoupled subtask. During the\nrefinement, we introduce coordinated denoising, which can merge multiple\ndiffusion experts' capabilities into a single sampling. Furthermore, we design\nConFiner-Long framework, which can generate long coherent video with three\nconstraint strategies on ConFiner. 
Experimental results indicate that with only\n10\\% of the inference cost, our ConFiner surpasses representative models like\nLavie and Modelscope across all objective and subjective metrics. And\nConFiner-Long can generate high-quality and coherent videos with up to 600\nframes.\n","authors":["Wenhao Li","Yichao Cao","Xiu Su","Xi Lin","Shan You","Mingkai Zheng","Yi Chen","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.13423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14825v1","updated":"2024-08-27T07:11:45Z","published":"2024-08-27T07:11:45Z","title":"From Rule-Based Models to Deep Learning Transformers Architectures for\n Natural Language Processing and Sign Language Translation Systems: Survey,\n Taxonomy and Performance Evaluation","summary":" With the growing Deaf and Hard of Hearing population worldwide and the\npersistent shortage of certified sign language interpreters, there is a\npressing need for an efficient, signs-driven, integrated end-to-end translation\nsystem, from sign to gloss to text and vice-versa. There has been a wealth of\nresearch on machine translations and related reviews. However, there are few\nworks on sign language machine translation considering the particularity of the\nlanguage being continuous and dynamic. This paper aims to address this void,\nproviding a retrospective analysis of the temporal evolution of sign language\nmachine translation algorithms and a taxonomy of the Transformers\narchitectures, the most used approach in language translation. We also present\nthe requirements of a real-time Quality-of-Service sign language ma-chine\ntranslation system underpinned by accurate deep learning algorithms. We propose\nfuture research directions for sign language translation systems.\n","authors":["Nada Shahin","Leila Ismail"],"pdf_url":"https://arxiv.org/pdf/2408.14825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14823v1","updated":"2024-08-27T07:06:49Z","published":"2024-08-27T07:06:49Z","title":"LapisGS: Layered Progressive 3D Gaussian Splatting for Adaptive\n Streaming","summary":" The rise of Extended Reality (XR) requires efficient streaming of 3D online\nworlds, challenging current 3DGS representations to adapt to\nbandwidth-constrained environments. This paper proposes LapisGS, a layered 3DGS\nthat supports adaptive streaming and progressive rendering. Our method\nconstructs a layered structure for cumulative representation, incorporates\ndynamic opacity optimization to maintain visual fidelity, and utilizes\noccupancy maps to efficiently manage Gaussian splats. This proposed model\noffers a progressive representation supporting a continuous rendering quality\nadapted for bandwidth-aware streaming. 
Extensive experiments validate the\neffectiveness of our approach in balancing visual fidelity with the compactness\nof the model, with up to 50.71% improvement in SSIM, 286.53% improvement in\nLPIPS, and 318.41% reduction in model size, and shows its potential for\nbandwidth-adapted 3D streaming and rendering applications.\n","authors":["Yuang Shi","Simone Gasparini","Géraldine Morin","Wei Tsang Ooi"],"pdf_url":"https://arxiv.org/pdf/2408.14823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09193v2","updated":"2024-08-27T07:02:07Z","published":"2024-04-14T09:01:26Z","title":"FaceCat: Enhancing Face Recognition Security with a Unified Diffusion\n Model","summary":" Face anti-spoofing (FAS) and adversarial detection (FAD) have been regarded\nas critical technologies to ensure the safety of face recognition systems.\nHowever, due to limited practicality, complex deployment, and the additional\ncomputational overhead, it is necessary to implement both detection techniques\nwithin a unified framework. This paper aims to achieve this goal by breaking\nthrough two primary obstacles: 1) the suboptimal face feature representation\nand 2) the scarcity of training data. To address the limited performance caused\nby existing feature representations, motivated by the rich structural and\ndetailed features of face diffusion models, we propose FaceCat, the first\napproach leveraging the diffusion model to simultaneously enhance the\nperformance of FAS and FAD. Specifically, FaceCat elaborately designs a\nhierarchical fusion mechanism to capture rich face semantic features of the\ndiffusion model. These features then serve as a robust foundation for a\nlightweight head, designed to execute FAS and FAD simultaneously. Due to the\nlimitations in feature representation that arise from relying solely on\nsingle-modality image data, we further propose a novel text-guided multi-modal\nalignment strategy that utilizes text prompts to enrich feature representation,\nthereby enhancing performance. To combat data scarcity, we build a\ncomprehensive dataset with a wide range of 28 attack types, offering greater\npotential for a unified framework in facial security. Extensive experiments\nvalidate the effectiveness of FaceCat generalizes significantly better and\nobtains excellent robustness against common input transformations.\n","authors":["Jiawei Chen","Xiao Yang","Yinpeng Dong","Hang Su","Zhaoxia Yin"],"pdf_url":"https://arxiv.org/pdf/2404.09193v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.14819v1","updated":"2024-08-27T07:01:56Z","published":"2024-08-27T07:01:56Z","title":"Build-A-Scene: Interactive 3D Layout Control for Diffusion-Based Image\n Generation","summary":" We propose a diffusion-based approach for Text-to-Image (T2I) generation with\ninteractive 3D layout control. Layout control has been widely studied to\nalleviate the shortcomings of T2I diffusion models in understanding objects'\nplacement and relationships from text descriptions. Nevertheless, existing\napproaches for layout control are limited to 2D layouts, require the user to\nprovide a static layout beforehand, and fail to preserve generated images under\nlayout changes. This makes these approaches unsuitable for applications that\nrequire 3D object-wise control and iterative refinements, e.g., interior design\nand complex scene generation. To this end, we leverage the recent advancements\nin depth-conditioned T2I models and propose a novel approach for interactive 3D\nlayout control. 
We replace the traditional 2D boxes used in layout control with\n3D boxes. Furthermore, we revamp the T2I task as a multi-stage generation\nprocess, where at each stage, the user can insert, change, and move an object\nin 3D while preserving objects from earlier stages. We achieve this through our\nproposed Dynamic Self-Attention (DSA) module and the consistent 3D object\ntranslation strategy. Experiments show that our approach can generate\ncomplicated scenes based on 3D layouts, boosting the object generation success\nrate over the standard depth-conditioned T2I methods by 2x. Moreover, it\noutperforms other methods in comparison in preserving objects under layout\nchanges. Project Page: \\url{https://abdo-eldesokey.github.io/build-a-scene/}\n","authors":["Abdelrahman Eldesokey","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2408.14819v1.pdf","comment":"Project Page: https://abdo-eldesokey.github.io/build-a-scene/"},{"id":"http://arxiv.org/abs/2408.14812v1","updated":"2024-08-27T06:50:28Z","published":"2024-08-27T06:50:28Z","title":"HPT++: Hierarchically Prompting Vision-Language Models with\n Multi-Granularity Knowledge Generation and Improved Structure Modeling","summary":" Prompt learning has become a prevalent strategy for adapting vision-language\nfoundation models (VLMs) such as CLIP to downstream tasks. With the emergence\nof large language models (LLMs), recent studies have explored the potential of\nusing category-related descriptions to enhance prompt effectiveness. However,\nconventional descriptions lack explicit structured information necessary to\nrepresent the interconnections among key elements like entities or attributes\nwith relation to a particular category. Since existing prompt tuning methods\ngive little consideration to managing structured knowledge, this paper\nadvocates leveraging LLMs to construct a graph for each description to\nprioritize such structured knowledge. Consequently, we propose a novel approach\ncalled Hierarchical Prompt Tuning (HPT), enabling simultaneous modeling of both\nstructured and conventional linguistic knowledge. Specifically, we introduce a\nrelationship-guided attention module to capture pair-wise associations among\nentities and attributes for low-level prompt learning. In addition, by\nincorporating high-level and global-level prompts modeling overall semantics,\nthe proposed hierarchical structure forges cross-level interlinks and empowers\nthe model to handle more complex and long-term relationships. Finally, by\nenhancing multi-granularity knowledge generation, redesigning the\nrelationship-driven attention re-weighting module, and incorporating consistent\nconstraints on the hierarchical text encoder, we propose HPT++, which further\nimproves the performance of HPT. Our experiments are conducted across a wide\nrange of evaluation settings, including base-to-new generalization,\ncross-dataset evaluation, and domain generalization. Extensive results and\nablation studies demonstrate the effectiveness of our methods, which\nconsistently outperform existing SOTA methods.\n","authors":["Yubin Wang","Xinyang Jiang","De Cheng","Wenli Sun","Dongsheng Li","Cairong Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.14812v1.pdf","comment":"19 pages, 7 figures, 7 tables. 
arXiv admin note: substantial text\n overlap with arXiv:2312.06323"},{"id":"http://arxiv.org/abs/2408.14810v1","updated":"2024-08-27T06:49:21Z","published":"2024-08-27T06:49:21Z","title":"Generalist Segmentation Algorithm for Photoreceptors Analysis in\n Adaptive Optics Imaging","summary":" Analyzing the cone photoreceptor pattern in images obtained from the living\nhuman retina using quantitative methods can be crucial for the early detection\nand management of various eye conditions. Confocal adaptive optics scanning\nlight ophthalmoscope (AOSLO) imaging enables visualization of the cones from\nreflections of waveguiding cone photoreceptors. While there have been\nsignificant improvements in automated algorithms for segmenting cones in\nconfocal AOSLO images, the process of labelling data remains labor-intensive\nand manual. This paper introduces a method based on deep learning (DL) for\ndetecting and segmenting cones in AOSLO images. The models were trained on a\nsemi-automatically labelled dataset of 20 AOSLO batches of images of 18\nparticipants for 0$^{\\circ}$, 1$^{\\circ}$, and 2$^{\\circ}$ from the foveal\ncenter. F1 scores were 0.968, 0.958, and 0.954 for 0$^{\\circ}$, 1$^{\\circ}$,\nand 2$^{\\circ}$, respectively, which is better than previously reported DL\napproaches. Our method minimizes the need for labelled data by only\nnecessitating a fraction of labelled cones, which is especially beneficial in\nthe field of ophthalmology, where labelled data can often be limited.\n","authors":["Mikhail Kulyabin","Aline Sindel","Hilde Pedersen","Stuart Gilson","Rigmor Baraas","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2408.14810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19271v2","updated":"2024-08-27T06:34:00Z","published":"2024-07-27T14:45:34Z","title":"Sewer Image Super-Resolution with Depth Priors and Its Lightweight\n Network","summary":" The Quick-view (QV) technique serves as a primary method for detecting\ndefects within sewerage systems. However, the effectiveness of QV is impeded by\nthe limited visual range of its hardware, resulting in suboptimal image quality\nfor distant portions of the sewer network. Image super-resolution is an\neffective way to improve image quality and has been applied in a variety of\nscenes. However, research on super-resolution for sewer images remains\nconsiderably unexplored. In response, this study leverages the inherent depth\nrelationships present within QV images and introduces a novel Depth-guided,\nReference-based Super-Resolution framework denoted as DSRNet. It comprises two\ncore components: a depth extraction module and a depth information matching\nmodule (DMM). DSRNet utilizes the adjacent frames of the low-resolution image\nas reference images and helps them recover texture information based on the\ncorrelation. By combining these modules, the integration of depth priors\nsignificantly enhances both visual quality and performance benchmarks. Besides,\nin pursuit of computational efficiency and compactness, a super-resolution\nknowledge distillation model based on an attention mechanism is introduced.\nThis mechanism facilitates the acquisition of feature similarity between a more\ncomplex teacher model and a streamlined student model, with the latter being a\nlightweight version of DSRNet. Experimental results demonstrate that DSRNet\nsignificantly improves PSNR and SSIM compared with other methods. 
This study\nalso conducts experiments on sewer defect semantic segmentation, object\ndetection, and classification on the Pipe dataset and Sewer-ML dataset.\nExperiments show that the method can improve the performance of low-resolution\nsewer images in these tasks.\n","authors":["Gang Pan","Chen Wang","Zhijie Sui","Shuai Guo","Yaozhi Lv","Honglie Li","Di Sun","Zixia Xia"],"pdf_url":"https://arxiv.org/pdf/2407.19271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14805v1","updated":"2024-08-27T06:24:51Z","published":"2024-08-27T06:24:51Z","title":"Platypus: A Generalized Specialist Model for Reading Text in Various\n Forms","summary":" Reading text from images (either natural scenes or documents) has been a\nlong-standing research topic for decades, due to the high technical challenge\nand wide application range. Previously, individual specialist models are\ndeveloped to tackle the sub-tasks of text reading (e.g., scene text\nrecognition, handwritten text recognition and mathematical expression\nrecognition). However, such specialist models usually cannot effectively\ngeneralize across different sub-tasks. Recently, generalist models (such as\nGPT-4V), trained on tremendous data in a unified way, have shown enormous\npotential in reading text in various scenarios, but with the drawbacks of\nlimited accuracy and low efficiency. In this work, we propose Platypus, a\ngeneralized specialist model for text reading. Specifically, Platypus combines\nthe best of both worlds: being able to recognize text of various forms with a\nsingle unified architecture, while achieving excellent accuracy and high\nefficiency. To better exploit the advantage of Platypus, we also construct a\ntext reading dataset (called Worms), the images of which are curated from\nprevious datasets and partially re-labeled. Experiments on standard benchmarks\ndemonstrate the effectiveness and superiority of the proposed Platypus model.\nModel and data will be made publicly available at\nhttps://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/Platypus.\n","authors":["Peng Wang","Zhaohai Li","Jun Tang","Humen Zhong","Fei Huang","Zhibo Yang","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2408.14805v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2408.14802v1","updated":"2024-08-27T06:14:54Z","published":"2024-08-27T06:14:54Z","title":"RAW-Adapter: Adapting Pre-trained Visual Model to Camera RAW Images","summary":" sRGB images are now the predominant choice for pre-training visual models in\ncomputer vision research, owing to their ease of acquisition and efficient\nstorage. Meanwhile, the advantage of RAW images lies in their rich physical\ninformation under variable real-world challenging lighting conditions. For\ncomputer vision tasks directly based on camera RAW data, most existing studies\nadopt methods of integrating image signal processor (ISP) with backend\nnetworks, yet often overlook the interaction capabilities between the ISP\nstages and subsequent networks. Drawing inspiration from ongoing adapter\nresearch in NLP and CV areas, we introduce RAW-Adapter, a novel approach aimed\nat adapting sRGB pre-trained models to camera RAW data. RAW-Adapter comprises\ninput-level adapters that employ learnable ISP stages to adjust RAW inputs, as\nwell as model-level adapters to build connections between ISP stages and\nsubsequent high-level networks. Additionally, RAW-Adapter is a general\nframework that could be used in various computer vision frameworks. 
Abundant\nexperiments under different lighting conditions have shown our algorithm's\nstate-of-the-art (SOTA) performance, demonstrating its effectiveness and\nefficiency across a range of real-world and synthetic datasets.\n","authors":["Ziteng Cui","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2408.14802v1.pdf","comment":"ECCV 2024, code link: https://github.com/cuiziteng/ECCV_RAW_Adapter"},{"id":"http://arxiv.org/abs/2408.14131v2","updated":"2024-08-27T05:54:42Z","published":"2024-08-26T09:26:08Z","title":"GenFormer -- Generated Images are All You Need to Improve Robustness of\n Transformers on Small Datasets","summary":" Recent studies showcase the competitive accuracy of Vision Transformers\n(ViTs) in relation to Convolutional Neural Networks (CNNs), along with their\nremarkable robustness. However, ViTs demand a large amount of data to achieve\nadequate performance, which makes their application to small datasets\nchallenging, falling behind CNNs. To overcome this, we propose GenFormer, a\ndata augmentation strategy utilizing generated images, thereby improving\ntransformer accuracy and robustness on small-scale image classification tasks.\nIn our comprehensive evaluation we propose Tiny ImageNetV2, -R, and -A as new\ntest set variants of Tiny ImageNet by transferring established ImageNet\ngeneralization and robustness benchmarks to the small-scale data domain.\nSimilarly, we introduce MedMNIST-C and EuroSAT-C as corrupted test set variants\nof established fine-grained datasets in the medical and aerial domain. Through\na series of experiments conducted on small datasets of various domains,\nincluding Tiny ImageNet, CIFAR, EuroSAT and MedMNIST datasets, we demonstrate\nthe synergistic power of our method, in particular when combined with common\ntrain and test time augmentations, knowledge distillation, and architectural\ndesign choices. Additionally, we prove the effectiveness of our approach under\nchallenging conditions with limited training data, demonstrating significant\nimprovements in both accuracy and robustness, bridging the gap between CNNs and\nViTs in the small-scale dataset domain.\n","authors":["Sven Oehri","Nikolas Ebert","Ahmed Abdullah","Didier Stricker","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2408.14131v2.pdf","comment":"This paper has been accepted at International Conference on Pattern\n Recognition (ICPR), 2024"},{"id":"http://arxiv.org/abs/2406.18459v5","updated":"2024-08-27T05:46:06Z","published":"2024-06-26T16:10:31Z","title":"DiffuseHigh: Training-free Progressive High-Resolution Image Synthesis\n through Structure Guidance","summary":" Large-scale generative models, such as text-to-image diffusion models, have\ngarnered widespread attention across diverse domains due to their creative and\nhigh-fidelity image generation. Nonetheless, existing large-scale diffusion\nmodels are confined to generating images of up to 1K resolution, which is far\nfrom meeting the demands of contemporary commercial applications. Directly\nsampling higher-resolution images often yields results marred by artifacts such\nas object repetition and distorted shapes. Addressing the aforementioned issues\ntypically necessitates training or fine-tuning models on higher-resolution\ndatasets. However, this poses a formidable challenge due to the difficulty in\ncollecting large-scale high-resolution images and substantial computational\nresources. 
While several preceding works have proposed alternatives to bypass\nthe cumbersome training process, they often fail to produce convincing results.\nIn this work, we probe the generative ability of diffusion models at higher\nresolution beyond their original capability and propose a novel progressive\napproach that fully utilizes generated low-resolution images to guide the\ngeneration of higher-resolution images. Our method obviates the need for\nadditional training or fine-tuning which significantly lowers the burden of\ncomputational costs. Extensive experiments and results validate the efficiency\nand efficacy of our method. Project page:\nhttps://yhyun225.github.io/DiffuseHigh/\n","authors":["Younghyun Kim","Geunmin Hwang","Junyu Zhang","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2406.18459v5.pdf","comment":"Project page: https://yhyun225.github.io/DiffuseHigh/"},{"id":"http://arxiv.org/abs/2408.14789v1","updated":"2024-08-27T05:31:30Z","published":"2024-08-27T05:31:30Z","title":"Revisiting Surgical Instrument Segmentation Without Human Intervention:\n A Graph Partitioning View","summary":" Surgical instrument segmentation (SIS) on endoscopic images stands as a\nlong-standing and essential task in the context of computer-assisted\ninterventions for boosting minimally invasive surgery. Given the recent surge\nof deep learning methodologies and their data-hungry nature, training a neural\npredictive model based on massive expert-curated annotations has been\ndominating and served as an off-the-shelf approach in the field, which could,\nhowever, impose prohibitive burden to clinicians for preparing fine-grained\npixel-wise labels corresponding to the collected surgical video frames. In this\nwork, we propose an unsupervised method by reframing the video frame\nsegmentation as a graph partitioning problem and regarding image pixels as\ngraph nodes, which is significantly different from the previous efforts. A\nself-supervised pre-trained model is firstly leveraged as a feature extractor\nto capture high-level semantic features. Then, Laplacian matrixs are computed\nfrom the features and are eigendecomposed for graph partitioning. On the \"deep\"\neigenvectors, a surgical video frame is meaningfully segmented into different\nmodules such as tools and tissues, providing distinguishable semantic\ninformation like locations, classes, and relations. The segmentation problem\ncan then be naturally tackled by applying clustering or threshold on the\neigenvectors. Extensive experiments are conducted on various datasets (e.g.,\nEndoVis2017, EndoVis2018, UCL, etc.) for different clinical endpoints. Across\nall the challenging scenarios, our method demonstrates outstanding performance\nand robustness higher than unsupervised state-of-the-art (SOTA) methods. The\ncode is released at https://github.com/MingyuShengSMY/GraphClusteringSIS.git.\n","authors":["Mingyu Sheng","Jianan Fan","Dongnan Liu","Ron Kikinis","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2408.14789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08345v2","updated":"2024-08-27T05:08:00Z","published":"2024-08-15T17:58:10Z","title":"5%>100%: Breaking Performance Shackles of Full Fine-Tuning on Visual\n Recognition Tasks","summary":" Pre-training & fine-tuning can enhance the transferring efficiency and\nperformance in visual tasks. Recent delta-tuning methods provide more options\nfor visual classification tasks. 
Despite their success, existing visual\ndelta-tuning art fails to exceed the upper limit of full fine-tuning on\nchallenging tasks like object detection and segmentation. To find a competitive\nalternative to full fine-tuning, we propose the Multi-cognitive Visual Adapter\n(Mona) tuning, a novel adapter-based tuning method. First, we introduce\nmultiple vision-friendly filters into the adapter to enhance its ability to\nprocess visual signals, while previous methods mainly rely on language-friendly\nlinear filters. Second, we add the scaled normalization layer in the adapter to\nregulate the distribution of input features for visual filters. To fully\ndemonstrate the practicality and generality of Mona, we conduct experiments on\nmultiple representative visual tasks, including instance segmentation on COCO,\nsemantic segmentation on ADE20K, object detection on Pascal VOC, oriented\nobject detection on DOTA/STAR, and image classification on three common\ndatasets. Exciting results illustrate that Mona surpasses full fine-tuning on\nall these tasks, and is the only delta-tuning method outperforming full\nfine-tuning on the above various tasks. For example, Mona achieves 1%\nperformance gain on the COCO dataset compared to full fine-tuning.\nComprehensive results suggest that Mona-tuning is more suitable for retaining\nand utilizing the capabilities of pre-trained models than full fine-tuning. The\ncode will be released at https://github.com/Leiyi-Hu/mona.\n","authors":["Dongshuo Yin","Leiyi Hu","Bin Li","Youqun Zhang","Xue Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08345v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2311.15010"},{"id":"http://arxiv.org/abs/2408.14176v2","updated":"2024-08-27T04:59:58Z","published":"2024-08-26T10:42:53Z","title":"SwiftBrush v2: Make Your One-step Diffusion Model Better Than Its\n Teacher","summary":" In this paper, we aim to enhance the performance of SwiftBrush, a prominent\none-step text-to-image diffusion model, to be competitive with its multi-step\nStable Diffusion counterpart. Initially, we explore the quality-diversity\ntrade-off between SwiftBrush and SD Turbo: the former excels in image\ndiversity, while the latter excels in image quality. This observation motivates\nour proposed modifications in the training methodology, including better weight\ninitialization and efficient LoRA training. Moreover, our introduction of a\nnovel clamped CLIP loss enhances image-text alignment and results in improved\nimage quality. Remarkably, by combining the weights of models trained with\nefficient LoRA and full training, we achieve a new state-of-the-art one-step\ndiffusion model, achieving an FID of 8.14 and surpassing all GAN-based and\nmulti-step Stable Diffusion models. The project page is available at\nhttps://swiftbrushv2.github.io.\n","authors":["Trung Dao","Thuan Hoang Nguyen","Thanh Le","Duc Vu","Khoi Nguyen","Cuong Pham","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2408.14176v2.pdf","comment":"Accepted to ECCV'24"},{"id":"http://arxiv.org/abs/2408.14776v1","updated":"2024-08-27T04:45:53Z","published":"2024-08-27T04:45:53Z","title":"MROVSeg: Breaking the Resolution Curse of Vision-Language Models in\n Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation aims to segment and recognize\nsemantically meaningful regions based on text-based descriptions during\ninference. 
A typical solution to address this task is to leverage powerful\nvision-language models (VLMs), such as CLIP, to bridge the gap between open-\nand close-vocabulary recognition. As VLMs are usually pretrained with\nlow-resolution images (e.g. $224\\times224$), most previous methods operate only\non downscaled images. We question this design as low resolution features often\nfail to preserve fine details. Although employing additional image backbones\nfor high-resolution inputs can mitigate this issue, it may also introduce\nsignificant computation overhead. Therefore, we propose MROVSeg, a\nmulti-resolution training framework for open-vocabulary semantic segmentation\nwith a single pretrained CLIP backbone, that uses sliding windows to slice the\nhigh-resolution input into uniform patches, each matching the input size of the\nwell-trained image encoder. Its key components include a Multi-Res Adapter,\nwhich restores the spatial geometry and grasps local-global correspondences\nacross patches by learnable convolutional and scale attention layers. To\nachieve accurate segmentation, we introduce Multi-grained Masked Attention\nscheme to aggregate multi-grained semantics by performing cross-attention\nbetween object queries and multi-resolution CLIP features within the region of\ninterests. Through comprehensive experiments, we demonstrate the superiority of\nMROVSeg on well-established open-vocabulary semantic segmentation benchmarks,\nparticularly for high-resolution inputs, establishing new standards for\nopen-vocabulary semantic segmentation.\n","authors":["Yuanbing Zhu","Bingke Zhu","Zhen Chen","Huan Xu","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14776v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2407.15773v2","updated":"2024-08-27T04:41:40Z","published":"2024-07-22T16:25:41Z","title":"STAMP: Outlier-Aware Test-Time Adaptation with Stable Memory Replay","summary":" Test-time adaptation (TTA) aims to address the distribution shift between the\ntraining and test data with only unlabeled data at test time. Existing TTA\nmethods often focus on improving recognition performance specifically for test\ndata associated with classes in the training set. However, during the\nopen-world inference process, there are inevitably test data instances from\nunknown classes, commonly referred to as outliers. This paper pays attention to\nthe problem that conducts both sample recognition and outlier rejection during\ninference while outliers exist. To address this problem, we propose a new\napproach called STAble Memory rePlay (STAMP), which performs optimization over\na stable memory bank instead of the risky mini-batch. In particular, the memory\nbank is dynamically updated by selecting low-entropy and label-consistent\nsamples in a class-balanced manner. In addition, we develop a self-weighted\nentropy minimization strategy that assigns higher weight to low-entropy\nsamples. Extensive results demonstrate that STAMP outperforms existing TTA\nmethods in terms of both recognition and outlier detection performance. 
The\ncode is released at https://github.com/yuyongcan/STAMP.\n","authors":["Yongcan Yu","Lijun Sheng","Ran He","Jian Liang"],"pdf_url":"https://arxiv.org/pdf/2407.15773v2.pdf","comment":"Accepted by ECCV 2024; Fixed a bug in calculating OOD score of STAMP\n and updated the results"},{"id":"http://arxiv.org/abs/2408.14770v1","updated":"2024-08-27T04:18:18Z","published":"2024-08-27T04:18:18Z","title":"Text-guided Foundation Model Adaptation for Long-Tailed Medical Image\n Classification","summary":" In medical contexts, the imbalanced data distribution in long-tailed\ndatasets, due to scarce labels for rare diseases, greatly impairs the\ndiagnostic accuracy of deep learning models. Recent multimodal text-image\nsupervised foundation models offer new solutions to data scarcity through\neffective representation learning. However, their limited medical-specific\npretraining hinders their performance in medical image classification relative\nto natural images. To address this issue, we propose a novel Text-guided\nFoundation model Adaptation for Long-Tailed medical image classification\n(TFA-LT). We adopt a two-stage training strategy, integrating representations\nfrom the foundation model using just two linear adapters and a single ensembler\nfor balanced outcomes. Experimental results on two long-tailed medical image\ndatasets validate the simplicity, lightweight and efficiency of our approach:\nrequiring only 6.1% GPU memory usage of the current best-performing algorithm,\nour method achieves an accuracy improvement of up to 27.1%, highlighting the\nsubstantial potential of foundation model adaptation in this area.\n","authors":["Sirui Li","Li Lin","Yijin Huang","Pujin Cheng","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2408.14770v1.pdf","comment":"Accepted by IEEE ISBI 2024"},{"id":"http://arxiv.org/abs/2408.14080v2","updated":"2024-08-27T04:14:14Z","published":"2024-08-26T08:02:57Z","title":"SONICS: Synthetic Or Not -- Identifying Counterfeit Songs","summary":" The recent surge in AI-generated songs presents exciting possibilities and\nchallenges. While these tools democratize music creation, they also necessitate\nthe ability to distinguish between human-composed and AI-generated songs for\nsafeguarding artistic integrity and content curation. Existing research and\ndatasets in fake song detection only focus on singing voice deepfake detection\n(SVDD), where the vocals are AI-generated but the instrumental music is sourced\nfrom real songs. However, this approach is inadequate for contemporary\nend-to-end AI-generated songs where all components (vocals, lyrics, music, and\nstyle) could be AI-generated. Additionally, existing datasets lack lyrics-music\ndiversity, long-duration songs, and open fake songs. To address these gaps, we\nintroduce SONICS, a novel dataset for end-to-end Synthetic Song Detection\n(SSD), comprising over 97k songs with over 49k synthetic songs from popular\nplatforms like Suno and Udio. Furthermore, we highlight the importance of\nmodeling long-range temporal dependencies in songs for effective authenticity\ndetection, an aspect overlooked in existing methods. To capture these patterns,\nwe propose a novel model, SpecTTTra, that is up to 3 times faster and 6 times\nmore memory efficient compared to popular CNN and Transformer-based models\nwhile maintaining competitive performance. 
Finally, we offer both AI-based and\nHuman evaluation benchmarks, addressing another deficiency in current research.\n","authors":["Md Awsafur Rahman","Zaber Ibn Abdul Hakim","Najibul Haque Sarker","Bishmoy Paul","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2408.14080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17137v4","updated":"2024-08-27T04:02:58Z","published":"2024-05-27T12:54:09Z","title":"Jump-teaching: Ultra Efficient and Robust Learning with Noisy Label","summary":" Sample selection is the most straightforward technique to combat label noise,\naiming to distinguish mislabeled samples during training and avoid the\ndegradation of the robustness of the model. In the workflow, $\\textit{selecting\npossibly clean data}$ and $\\textit{model update}$ are iterative. However, their\ninterplay and intrinsic characteristics hinder the robustness and efficiency of\nlearning with noisy labels: 1) The model chooses clean data with selection\nbias, leading to the accumulated error in the model update. 2) Most selection\nstrategies leverage partner networks or supplementary information to mitigate\nlabel corruption, albeit with increased computation resources and lower\nthroughput speed. Therefore, we employ only one network with the jump manner\nupdate to decouple the interplay and mine more semantic information from the\nloss for a more precise selection. Specifically, the selection of clean data\nfor each model update is based on one of the prior models, excluding the last\niteration. The strategy of model update exhibits a jump behavior in the form.\nMoreover, we map the outputs of the network and labels into the same semantic\nfeature space, respectively. In this space, a detailed and simple loss\ndistribution is generated to distinguish clean samples more effectively. Our\nproposed approach achieves almost up to $2.53\\times$ speedup, $0.46\\times$ peak\nmemory footprint, and superior robustness over state-of-the-art works with\nvarious noise settings.\n","authors":["Kangye Ji","Fei Cheng","Zeqing Wang","Bohu Huang"],"pdf_url":"https://arxiv.org/pdf/2405.17137v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19730v4","updated":"2024-08-27T03:45:18Z","published":"2024-05-30T06:21:34Z","title":"Research on the Spatial Data Intelligent Foundation Model","summary":" This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. 
Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Jingyuan Wang","Zhengchao Chen","Hao Lu","Jiayi Li","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v4.pdf","comment":"V1 and V2 are in Chinese language, other versions are in English"},{"id":"http://arxiv.org/abs/2408.14765v1","updated":"2024-08-27T03:41:44Z","published":"2024-08-27T03:41:44Z","title":"CrossViewDiff: A Cross-View Diffusion Model for Satellite-to-Street View\n Synthesis","summary":" Satellite-to-street view synthesis aims at generating a realistic street-view\nimage from its corresponding satellite-view image. Although stable diffusion\nmodels have exhibit remarkable performance in a variety of image generation\napplications, their reliance on similar-view inputs to control the generated\nstructure or texture restricts their application to the challenging cross-view\nsynthesis task. In this work, we propose CrossViewDiff, a cross-view diffusion\nmodel for satellite-to-street view synthesis. To address the challenges posed\nby the large discrepancy across views, we design the satellite scene structure\nestimation and cross-view texture mapping modules to construct the structural\nand textural controls for street-view image synthesis. We further design a\ncross-view control guided denoising process that incorporates the above\ncontrols via an enhanced cross-view attention module. To achieve a more\ncomprehensive evaluation of the synthesis results, we additionally design a\nGPT-based scoring method as a supplement to standard evaluation metrics. We\nalso explore the effect of different data sources (e.g., text, maps, building\nheights, and multi-temporal satellite imagery) on this task. Results on three\npublic cross-view datasets show that CrossViewDiff outperforms current\nstate-of-the-art on both standard and GPT-based evaluation metrics, generating\nhigh-quality street-view panoramas with more realistic structures and textures\nacross rural, suburban, and urban scenes. The code and models of this work will\nbe released at https://opendatalab.github.io/CrossViewDiff/.\n","authors":["Weijia Li","Jun He","Junyan Ye","Huaping Zhong","Zhimeng Zheng","Zilong Huang","Dahua Lin","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2408.14765v1.pdf","comment":"21 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.04822v2","updated":"2024-08-27T03:33:51Z","published":"2023-12-08T04:12:26Z","title":"SiCP: Simultaneous Individual and Cooperative Perception for 3D Object\n Detection in Connected and Automated Vehicles","summary":" Cooperative perception for connected and automated vehicles is traditionally\nachieved through the fusion of feature maps from two or more vehicles. However,\nthe absence of feature maps shared from other vehicles can lead to a\nsignificant decline in 3D object detection performance for cooperative\nperception models compared to standalone 3D detection models. 
This drawback\nimpedes the adoption of cooperative perception as vehicle resources are often\ninsufficient to concurrently employ two perception models. To tackle this\nissue, we present Simultaneous Individual and Cooperative Perception (SiCP), a\ngeneric framework that supports a wide range of the state-of-the-art standalone\nperception backbones and enhances them with a novel Dual-Perception Network\n(DP-Net) designed to facilitate both individual and cooperative perception. In\naddition to its lightweight nature with only 0.13M parameters, DP-Net is robust\nand retains crucial gradient information during feature map fusion. As\ndemonstrated in a comprehensive evaluation on the V2V4Real and OPV2V datasets,\nthanks to DP-Net, SiCP surpasses state-of-the-art cooperative perception\nsolutions while preserving the performance of standalone perception solutions.\n","authors":["Deyuan Qu","Qi Chen","Tianyu Bai","Hongsheng Lu","Heng Fan","Hao Zhang","Song Fu","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2312.04822v2.pdf","comment":"Accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2408.14764v1","updated":"2024-08-27T03:31:24Z","published":"2024-08-27T03:31:24Z","title":"SynthDoc: Bilingual Documents Synthesis for Visual Document\n Understanding","summary":" This paper introduces SynthDoc, a novel synthetic document generation\npipeline designed to enhance Visual Document Understanding (VDU) by generating\nhigh-quality, diverse datasets that include text, images, tables, and charts.\nAddressing the challenges of data acquisition and the limitations of existing\ndatasets, SynthDoc leverages publicly available corpora and advanced rendering\ntools to create a comprehensive and versatile dataset. Our experiments,\nconducted using the Donut model, demonstrate that models trained with\nSynthDoc's data achieve superior performance in pre-training read tasks and\nmaintain robustness in downstream tasks, despite language inconsistencies. The\nrelease of a benchmark dataset comprising 5,000 image-text pairs not only\nshowcases the pipeline's capabilities but also provides a valuable resource for\nthe VDU community to advance research and development in document image\nrecognition. This work significantly contributes to the field by offering a\nscalable solution to data scarcity and by validating the efficacy of end-to-end\nmodels in parsing complex, real-world documents.\n","authors":["Chuanghao Ding","Xuejing Liu","Wei Tang","Juan Li","Xiaoliang Wang","Rui Zhao","Cam-Tu Nguyen","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2408.14764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14757v1","updated":"2024-08-27T03:17:52Z","published":"2024-08-27T03:17:52Z","title":"Learning effective pruning at initialization from iterative pruning","summary":" Pruning at initialization (PaI) reduces training costs by removing weights\nbefore training, which becomes increasingly crucial with the growing network\nsize. However, current PaI methods still have a large accuracy gap with\niterative pruning, especially at high sparsity levels. This raises an\nintriguing question: can we get inspiration from iterative pruning to improve\nthe PaI performance? In the lottery ticket hypothesis, the iterative rewind\npruning (IRP) finds subnetworks retroactively by rewinding the parameter to the\noriginal initialization in every pruning iteration, which means all the\nsubnetworks are based on the initial state. 
Here, we hypothesise the surviving\nsubnetworks are more important and bridge the initial feature and their\nsurviving score as the PaI criterion. We employ an end-to-end neural network\n(\\textbf{AutoS}parse) to learn this correlation, input the model's initial\nfeatures, output their score and then prune the lowest score parameters before\ntraining. To validate the accuracy and generalization of our method, we\nperformed PaI across various models. Results show that our approach outperforms\nexisting methods in high-sparsity settings. Notably, as the underlying logic of\nmodel pruning is consistent in different models, only one-time IRP on one model\nis needed (e.g., once IRP on ResNet-18/CIFAR-10, AutoS can be generalized to\nVGG-16/CIFAR-10, ResNet-18/TinyImageNet, et al.). As the first neural\nnetwork-based PaI method, we conduct extensive experiments to validate the\nfactors influencing this approach. These results reveal the learning tendencies\nof neural networks and provide new insights into our understanding and research\nof PaI from a practical perspective. Our code is available at:\nhttps://github.com/ChengYaofeng/AutoSparse.git.\n","authors":["Shengkai Liu","Yaofeng Cheng","Fusheng Zha","Wei Guo","Lining Sun","Zhenshan Bing","Chenguang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.14757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14754v1","updated":"2024-08-27T03:09:39Z","published":"2024-08-27T03:09:39Z","title":"Sequential-Scanning Dual-Energy CT Imaging Using High Temporal\n Resolution Image Reconstruction and Error-Compensated Material Basis Image\n Generation","summary":" Dual-energy computed tomography (DECT) has been widely used to obtain\nquantitative elemental composition of imaged subjects for personalized and\nprecise medical diagnosis. Compared with DECT leveraging advanced X-ray source\nand/or detector technologies, the use of the sequential-scanning data\nacquisition scheme to implement DECT may make a broader impact on clinical\npractice because this scheme requires no specialized hardware designs and can\nbe directly implemented into conventional CT systems. However, since the\nconcentration of iodinated contrast agent in the imaged subject varies over\ntime, sequentially scanned data sets acquired at two tube potentials are\ntemporally inconsistent. As existing material basis image reconstruction\napproaches assume that the data sets acquired at two tube potentials are\ntemporally consistent, the violation of this assumption results in inaccurate\nquantification of material concentration. In this work, we developed\nsequential-scanning DECT imaging using high temporal resolution image\nreconstruction and error-compensated material basis image generation,\nACCELERATION in short, to address the technical challenge induced by temporal\ninconsistency of sequentially scanned data sets and improve quantification\naccuracy of material concentration in sequential-scanning DECT. 
ACCELERATION\nhas been validated and evaluated using numerical simulation data sets generated\nfrom clinical human subject exams and experimental human subject studies.\nResults demonstrated the improvement of quantification accuracy and image\nquality using ACCELERATION.\n","authors":["Qiaoxin Li","Ruifeng Chen","Peng Wang","Guotao Quan","Yanfeng Du","Dong Liang","Yinsheng Li"],"pdf_url":"https://arxiv.org/pdf/2408.14754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12340v2","updated":"2024-08-27T02:53:37Z","published":"2024-08-22T12:36:10Z","title":"VTON-HandFit: Virtual Try-on for Arbitrary Hand Pose Guided by Hand\n Priors Embedding","summary":" Although diffusion-based image virtual try-on has made considerable progress,\nemerging approaches still struggle to effectively address the issue of hand\nocclusion (i.e., clothing regions occluded by the hand part), leading to a\nnotable degradation of the try-on performance. To tackle this issue widely\nexisting in real-world scenarios, we propose VTON-HandFit, leveraging the power\nof hand priors to reconstruct the appearance and structure for hand occlusion\ncases. Firstly, we tailor a Handpose Aggregation Net using the ControlNet-based\nstructure explicitly and adaptively encoding the global hand and pose priors.\nBesides, to fully exploit the hand-related structure and appearance\ninformation, we propose Hand-feature Disentanglement Embedding module to\ndisentangle the hand priors into the hand structure-parametric and\nvisual-appearance features, and customize a masked cross attention for further\ndecoupled feature embedding. Lastly, we customize a hand-canny constraint loss\nto better learn the structure edge knowledge from the hand template of model\nimage. VTON-HandFit outperforms the baselines in qualitative and quantitative\nevaluations on the public dataset and our self-collected hand-occlusion\nHandfit-3K dataset particularly for the arbitrary hand pose occlusion cases in\nreal-world scenarios. The Code and dataset will be available at\n\\url{https://github.com/VTON-HandFit/VTON-HandFit}.\n","authors":["Yujie Liang","Xiaobin Hu","Boyuan Jiang","Donghao Luo","Kai WU","Wenhui Han","Taisong Jin","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12340v2.pdf","comment":"The project page is \\url{https://vton-handfit.github.io}"},{"id":"http://arxiv.org/abs/2305.10662v2","updated":"2024-08-27T02:46:38Z","published":"2023-05-18T02:51:17Z","title":"Private Gradient Estimation is Useful for Generative Modeling","summary":" While generative models have proved successful in many domains, they may pose\na privacy leakage risk in practical deployment. To address this issue,\ndifferentially private generative model learning has emerged as a solution to\ntrain private generative models for different downstream tasks. However,\nexisting private generative modeling approaches face significant challenges in\ngenerating high-dimensional data due to the inherent complexity involved in\nmodeling such data. In this work, we present a new private generative modeling\napproach where samples are generated via Hamiltonian dynamics with gradients of\nthe private dataset estimated by a well-trained network. In the approach, we\nachieve differential privacy by perturbing the projection vectors in the\nestimation of gradients with sliced score matching. In addition, we enhance the\nreconstruction ability of the model by incorporating a residual enhancement\nmodule during the score matching. 
For sampling, we perform Hamiltonian dynamics\nwith gradients estimated by the well-trained network, allowing the sampled data\nclose to the private dataset's manifold step by step. In this way, our model is\nable to generate data with a resolution of 256x256. Extensive experiments and\nanalysis clearly demonstrate the effectiveness and rationality of the proposed\napproach.\n","authors":["Bochao Liu","Pengju Wang","Weijia Guo","Yong Li","Liansheng Zhuang","Weiping Wang","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2305.10662v2.pdf","comment":"accepted by ACM MM 2024 Oral"},{"id":"http://arxiv.org/abs/2408.14744v1","updated":"2024-08-27T02:45:26Z","published":"2024-08-27T02:45:26Z","title":"RSTeller: Scaling Up Visual Language Modeling in Remote Sensing with\n Rich Linguistic Semantics from Openly Available Data and Large Language\n Models","summary":" Abundant, well-annotated multimodal data in remote sensing are pivotal for\naligning complex visual remote sensing (RS) scenes with human language,\nenabling the development of specialized vision language models across diverse\nRS interpretation tasks. However, annotating RS images with rich linguistic\nsemantics at scale demands expertise in RS and substantial human labor, making\nit costly and often impractical. In this study, we propose a workflow that\nleverages large language models (LLMs) to generate multimodal datasets with\nsemantically rich captions at scale from plain OpenStreetMap (OSM) data for\nimages sourced from the Google Earth Engine (GEE) platform. This approach\nfacilitates the generation of paired remote sensing data and can be readily\nscaled up using openly available data. Within this framework, we present\nRSTeller, a multimodal dataset comprising over 1 million RS images, each\naccompanied by multiple descriptive captions. Extensive experiments demonstrate\nthat RSTeller enhances the performance of multiple existing vision language\nmodels for RS scene understanding through continual pre-training. Our\nmethodology significantly reduces the manual effort and expertise needed for\nannotating remote sensing imagery while democratizing access to high-quality\nannotated data. This advancement fosters progress in visual language modeling\nand encourages broader participation in remote sensing research and\napplications. The RSTeller dataset is available at\nhttps://github.com/SlytherinGe/RSTeller.\n","authors":["Junyao Ge","Yang Zheng","Kaitai Guo","Jimin Liang"],"pdf_url":"https://arxiv.org/pdf/2408.14744v1.pdf","comment":"Submitted to ISPRS"},{"id":"http://arxiv.org/abs/2408.14743v1","updated":"2024-08-27T02:43:40Z","published":"2024-08-27T02:43:40Z","title":"Personalized Video Summarization using Text-Based Queries and\n Conditional Modeling","summary":" The proliferation of video content on platforms like YouTube and Vimeo\npresents significant challenges in efficiently locating relevant information.\nAutomatic video summarization aims to address this by extracting and presenting\nkey content in a condensed form. This thesis explores enhancing video\nsummarization by integrating text-based queries and conditional modeling to\ntailor summaries to user needs. Traditional methods often produce fixed\nsummaries that may not align with individual requirements. To overcome this, we\npropose a multi-modal deep learning approach that incorporates both textual\nqueries and visual information, fusing them at different levels of the model\narchitecture. 
Evaluation metrics such as accuracy and F1-score assess the\nquality of the generated summaries. The thesis also investigates improving\ntext-based query representations using contextualized word embeddings and\nspecialized attention networks. This enhances the semantic understanding of\nqueries, leading to better video summaries. To emulate human-like\nsummarization, which accounts for both visual coherence and abstract factors\nlike storyline consistency, we introduce a conditional modeling approach. This\nmethod uses multiple random variables and joint distributions to capture key\nsummarization components, resulting in more human-like and explainable\nsummaries. Addressing data scarcity in fully supervised learning, the thesis\nproposes a segment-level pseudo-labeling approach. This self-supervised method\ngenerates additional data, improving model performance even with limited\nhuman-labeled datasets. In summary, this research aims to enhance automatic\nvideo summarization by incorporating text-based queries, improving query\nrepresentations, introducing conditional modeling, and addressing data\nscarcity, thereby creating more effective and personalized video summaries.\n","authors":["Jia-Hong Huang"],"pdf_url":"https://arxiv.org/pdf/2408.14743v1.pdf","comment":"Ph.D. thesis, 137 pages"},{"id":"http://arxiv.org/abs/2408.12569v3","updated":"2024-08-27T02:31:42Z","published":"2024-08-22T17:37:27Z","title":"Sapiens: Foundation for Human Vision Models","summary":" We present Sapiens, a family of models for four fundamental human-centric\nvision tasks -- 2D pose estimation, body-part segmentation, depth estimation,\nand surface normal prediction. Our models natively support 1K high-resolution\ninference and are extremely easy to adapt for individual tasks by simply\nfine-tuning models pretrained on over 300 million in-the-wild human images. We\nobserve that, given the same computational budget, self-supervised pretraining\non a curated dataset of human images significantly boosts the performance for a\ndiverse set of human-centric tasks. The resulting models exhibit remarkable\ngeneralization to in-the-wild data, even when labeled data is scarce or\nentirely synthetic. Our simple model design also brings scalability -- model\nperformance across tasks improves as we scale the number of parameters from 0.3\nto 2 billion. Sapiens consistently surpasses existing baselines across various\nhuman-centric benchmarks. We achieve significant improvements over the prior\nstate-of-the-art on Humans-5K (pose) by 7.6 mAP, Humans-2K (part-seg) by 17.1\nmIoU, Hi4D (depth) by 22.4% relative RMSE, and THuman2 (normal) by 53.5%\nrelative angular error. Project page:\nhttps://about.meta.com/realitylabs/codecavatars/sapiens.\n","authors":["Rawal Khirodkar","Timur Bagautdinov","Julieta Martinez","Su Zhaoen","Austin James","Peter Selednik","Stuart Anderson","Shunsuke Saito"],"pdf_url":"https://arxiv.org/pdf/2408.12569v3.pdf","comment":"ECCV 2024 (Oral)"},{"id":"http://arxiv.org/abs/2408.13800v2","updated":"2024-08-27T02:30:47Z","published":"2024-08-25T10:42:07Z","title":"BCDNet: A Convolutional Neural Network For Breast Cancer Detection","summary":" Previous research has established that breast cancer is a prevalent cancer\ntype, with Invasive Ductal Carcinoma (IDC) being the most common subtype. The\nincidence of this dangerous cancer continues to rise, making accurate and rapid\ndiagnosis, particularly in the early stages, critically important. 
While modern\nComputer-Aided Diagnosis (CAD) systems can address most cases, medical\nprofessionals still face challenges in using them in the field without powerful\ncomputing resources. In this paper, we propose a novel CNN model called BCDNet,\nwhich effectively detects IDC in histopathological images with an accuracy of\nup to 89.5% and reduces training time effectively.\n","authors":["Yujia Lin","Aiwei Lian","Mingyu Liao","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2408.13800v2.pdf","comment":"5 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.14738v1","updated":"2024-08-27T02:29:29Z","published":"2024-08-27T02:29:29Z","title":"Learning Differentially Private Diffusion Models via Stochastic\n Adversarial Distillation","summary":" While the success of deep learning relies on large amounts of training\ndatasets, data is often limited in privacy-sensitive domains. To address this\nchallenge, generative model learning with differential privacy has emerged as a\nsolution to train private generative models for desensitized data generation.\nHowever, the quality of the images generated by existing methods is limited due\nto the complexity of modeling data distribution. We build on the success of\ndiffusion models and introduce DP-SAD, which trains a private diffusion model\nby a stochastic adversarial distillation method. Specifically, we first train a\ndiffusion model as a teacher and then train a student by distillation, in which\nwe achieve differential privacy by adding noise to the gradients from other\nmodels to the student. For better generation quality, we introduce a\ndiscriminator to distinguish whether an image is from the teacher or the\nstudent, which forms the adversarial training. Extensive experiments and\nanalysis clearly demonstrate the effectiveness of our proposed method.\n","authors":["Bochao Liu","Pengju Wang","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2408.14738v1.pdf","comment":"accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2408.13623v2","updated":"2024-08-27T01:59:59Z","published":"2024-08-24T16:33:26Z","title":"Prompt-Softbox-Prompt: A free-text Embedding Control for Image Editing","summary":" Text-driven diffusion models have achieved remarkable success in image\nediting, but a crucial component in these models-text embeddings-has not been\nfully explored. The entanglement and opacity of text embeddings present\nsignificant challenges to achieving precise image editing. In this paper, we\nprovide a comprehensive and in-depth analysis of text embeddings in Stable\nDiffusion XL, offering three key insights. First, while the 'aug_embedding'\ncaptures the full semantic content of the text, its contribution to the final\nimage generation is relatively minor. Second, 'BOS' and 'Padding_embedding' do\nnot contain any semantic information. Lastly, the 'EOS' holds the semantic\ninformation of all words and contains the most style features. Each word\nembedding plays a unique role without interfering with one another. Based on\nthese insights, we propose a novel approach for controllable image editing\nusing a free-text embedding control method called PSP (Prompt-Softbox-Prompt).\nPSP enables precise image editing by inserting or adding text embeddings within\nthe cross-attention layers and using Softbox to define and control the specific\narea for semantic injection. This technique allows for obejct additions and\nreplacements while preserving other areas of the image. Additionally, PSP can\nachieve style transfer by simply replacing text embeddings. 
Extensive\nexperimental results show that PSP achieves significant results in tasks such\nas object replacement, object addition, and style transfer.\n","authors":["Yitong Yang","Yinglin Wang","Jing Wang","Tian Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14732v1","updated":"2024-08-27T01:55:40Z","published":"2024-08-27T01:55:40Z","title":"OctFusion: Octree-based Diffusion Models for 3D Shape Generation","summary":" Diffusion models have emerged as a popular method for 3D generation. However,\nit is still challenging for diffusion models to efficiently generate diverse\nand high-quality 3D shapes. In this paper, we introduce OctFusion, which can\ngenerate 3D shapes with arbitrary resolutions in 2.5 seconds on a single Nvidia\n4090 GPU, and the extracted meshes are guaranteed to be continuous and\nmanifold. The key components of OctFusion are the octree-based latent\nrepresentation and the accompanying diffusion models. The representation\ncombines the benefits of both implicit neural representations and explicit\nspatial octrees and is learned with an octree-based variational autoencoder.\nThe proposed diffusion model is a unified multi-scale U-Net that enables\nweights and computation sharing across different octree levels and avoids the\ncomplexity of widely used cascaded diffusion schemes. We verify the\neffectiveness of OctFusion on the ShapeNet and Objaverse datasets and achieve\nstate-of-the-art performances on shape generation tasks. We demonstrate that\nOctFusion is extendable and flexible by generating high-quality color fields\nfor textured mesh generation and high-quality 3D shapes conditioned on text\nprompts, sketches, or category labels. Our code and pre-trained models are\navailable at \\url{https://github.com/octree-nn/octfusion}.\n","authors":["Bojun Xiong","Si-Tong Wei","Xin-Yang Zheng","Yan-Pei Cao","Zhouhui Lian","Peng-Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14732v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2408.14724v1","updated":"2024-08-27T01:28:15Z","published":"2024-08-27T01:28:15Z","title":"GeoTransfer : Generalizable Few-Shot Multi-View Reconstruction via\n Transfer Learning","summary":" This paper presents a novel approach for sparse 3D reconstruction by\nleveraging the expressive power of Neural Radiance Fields (NeRFs) and fast\ntransfer of their features to learn accurate occupancy fields. Existing 3D\nreconstruction methods from sparse inputs still struggle with capturing\nintricate geometric details and can suffer from limitations in handling\noccluded regions. On the other hand, NeRFs excel in modeling complex scenes but\ndo not offer means to extract meaningful geometry. Our proposed method offers\nthe best of both worlds by transferring the information encoded in NeRF\nfeatures to derive an accurate occupancy field representation. We utilize a\npre-trained, generalizable state-of-the-art NeRF network to capture detailed\nscene radiance information, and rapidly transfer this knowledge to train a\ngeneralizable implicit occupancy network. This process helps in leveraging the\nknowledge of the scene geometry encoded in the generalizable NeRF prior and\nrefining it to learn occupancy fields, facilitating a more precise\ngeneralizable representation of 3D space. The transfer learning approach leads\nto a dramatic reduction in training time, by orders of magnitude (i.e. 
from\nseveral days to 3.5 hrs), obviating the need to train generalizable sparse\nsurface reconstruction methods from scratch. Additionally, we introduce a novel\nloss on volumetric rendering weights that helps in the learning of accurate\noccupancy fields, along with a normal loss that helps in global smoothing of\nthe occupancy fields. We evaluate our approach on the DTU dataset and\ndemonstrate state-of-the-art performance in terms of reconstruction accuracy,\nespecially in challenging scenarios with sparse input data and occluded\nregions. We furthermore demonstrate the generalization capabilities of our\nmethod by showing qualitative results on the Blended MVS dataset without any\nretraining.\n","authors":["Shubhendu Jena","Franck Multon","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2408.14724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13621v5","updated":"2024-08-27T01:23:50Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. The robustness of\nthese techniques, however, remains a concern, particularly in the face of\nadversarial attacks that have been proven to deceive state-of-the-art deep\nneural networks in many domains. Surprisingly, the robustness of scene flow\nnetworks against such attacks has not been thoroughly investigated. To address\nthis problem, the proposed approach aims to bridge this gap by introducing\nadversarial white-box attacks specifically tailored for scene flow networks.\nExperimental results show that the generated adversarial examples obtain up to\n33.7 relative degradation in average end-point error on the KITTI and\nFlyingThings3D datasets. The study also reveals the significant impact that\nattacks targeting point clouds in only one dimension or color channel have on\naverage end-point error. Analyzing the success and failure of these attacks on\nthe scene flow networks and their 2D optical flow network variants shows a\nhigher vulnerability for the optical flow networks. Code is available at\nhttps://github.com/aheldis/Attack-on-Scene-Flow-using-Point-Clouds.git.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14723v1","updated":"2024-08-27T01:23:49Z","published":"2024-08-27T01:23:49Z","title":"Snap and Diagnose: An Advanced Multimodal Retrieval System for\n Identifying Plant Diseases in the Wild","summary":" Plant disease recognition is a critical task that ensures crop health and\nmitigates the damage caused by diseases. A handy tool that enables farmers to\nreceive a diagnosis based on query pictures or the text description of\nsuspicious plants is in high demand for initiating treatment before potential\ndiseases spread further. In this paper, we develop a multimodal plant disease\nimage retrieval system to support disease search based on either image or text\nprompts. Specifically, we utilize the largest in-the-wild plant disease dataset\nPlantWild, which includes over 18,000 images across 89 categories, to provide a\ncomprehensive view of potential diseases relating to the query. 
Furthermore,\ncross-modal retrieval is achieved in the developed system, facilitated by a\nnovel CLIP-based vision-language model that encodes both disease descriptions\nand disease images into the same latent space. Built on top of the retriever,\nour retrieval system allows users to upload either plant disease images or\ndisease descriptions to retrieve the corresponding images with similar\ncharacteristics from the disease dataset to suggest candidate diseases for end\nusers' consideration.\n","authors":["Tianqi Wei","Zhi Chen","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2408.14723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06889v3","updated":"2024-08-27T00:03:31Z","published":"2024-07-09T14:18:35Z","title":"A Neurosymbolic Approach to Adaptive Feature Extraction in SLAM","summary":" Autonomous robots, autonomous vehicles, and humans wearing mixed-reality\nheadsets require accurate and reliable tracking services for safety-critical\napplications in dynamically changing real-world environments. However, the\nexisting tracking approaches, such as Simultaneous Localization and Mapping\n(SLAM), do not adapt well to environmental changes and boundary conditions\ndespite extensive manual tuning. On the other hand, while deep learning-based\napproaches can better adapt to environmental changes, they typically demand\nsubstantial data for training and often lack flexibility in adapting to new\ndomains. To solve this problem, we propose leveraging the neurosymbolic program\nsynthesis approach to construct adaptable SLAM pipelines that integrate the\ndomain knowledge from traditional SLAM approaches while leveraging data to\nlearn complex relationships. While the approach can synthesize end-to-end SLAM\npipelines, we focus on synthesizing the feature extraction module. We first\ndevise a domain-specific language (DSL) that can encapsulate domain knowledge\non the important attributes for feature extraction and the real-world\nperformance of various feature extractors. Our neurosymbolic architecture then\nundertakes adaptive feature extraction, optimizing parameters via learning\nwhile employing symbolic reasoning to select the most suitable feature\nextractor. Our evaluations demonstrate that our approach, neurosymbolic Feature\nEXtraction (nFEX), yields higher-quality features. It also reduces the pose\nerror observed for the state-of-the-art baseline feature extractors ORB and\nSIFT by up to 90% and up to 66%, respectively, thereby enhancing the system's\nefficiency and adaptability to novel environments.\n","authors":["Yasra Chandio","Momin A. Khan","Khotso Selialia","Luis Garcia","Joseph DeGol","Fatima M. Anwar"],"pdf_url":"https://arxiv.org/pdf/2407.06889v3.pdf","comment":"8 pages, 6 figures, and 5 tables. Published at the 2024 IEEE/RSJ\n International Conference on Intelligent Robots and Systems (IROS).\n Corresponding author: Yasra Chandio (ychandio@umass.edu)"},{"id":"http://arxiv.org/abs/2408.15447v1","updated":"2024-08-27T23:53:52Z","published":"2024-08-27T23:53:52Z","title":"Fine-grained length controllable video captioning with ordinal\n embeddings","summary":" This paper proposes a method for video captioning that controls the length of\ngenerated captions. Previous work on length control often had few levels for\nexpressing length. In this study, we propose two methods of length embedding\nfor fine-grained length control. A traditional embedding method is linear,\nusing a one-hot vector and an embedding matrix. 
In this study, we propose\nmethods that represent length in multi-hot vectors. One is bit embedding that\nexpresses length in bit representation, and the other is ordinal embedding that\nuses the binary representation often used in ordinal regression. These length\nrepresentations of multi-hot vectors are converted into length embedding by a\nnonlinear MLP. This method allows for not only the length control of caption\nsentences but also the control of the time when reading the caption.\nExperiments using ActivityNet Captions and Spoken Moments in Time show that the\nproposed method effectively controls the length of the generated captions.\nAnalysis of the embedding vectors with ICA shows that length and semantics were\nlearned separately, demonstrating the effectiveness of the proposed embedding\nmethods.\n","authors":["Tomoya Nitta","Takumi Fukuzawa","Toru Tamaki"],"pdf_url":"https://arxiv.org/pdf/2408.15447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15891v4","updated":"2024-08-27T22:09:19Z","published":"2024-04-24T14:29:26Z","title":"OMEGAS: Object Mesh Extraction from Large Scenes Guided by Gaussian\n Segmentation","summary":" Recent advancements in 3D reconstruction technologies have paved the way for\nhigh-quality and real-time rendering of complex 3D scenes. Despite these\nachievements, a notable challenge persists: it is difficult to precisely\nreconstruct specific objects from large scenes. Current scene reconstruction\ntechniques frequently result in the loss of object detail textures and are\nunable to reconstruct object portions that are occluded or unseen in views. To\naddress this challenge, we delve into the meticulous 3D reconstruction of\nspecific objects within large scenes and propose a framework termed OMEGAS:\nObject Mesh Extraction from Large Scenes Guided by Gaussian Segmentation.\nSpecifically, we proposed a novel 3D target segmentation technique based on 2D\nGaussian Splatting, which segments 3D consistent target masks in multi-view\nscene images and generates a preliminary target model. Moreover, to reconstruct\nthe unseen portions of the target, we propose a novel target replenishment\ntechnique driven by large-scale generative diffusion priors. We demonstrate\nthat our method can accurately reconstruct specific targets from large scenes,\nboth quantitatively and qualitatively. Our experiments show that OMEGAS\nsignificantly outperforms existing reconstruction methods across various\nscenarios. Our project page is at: https://github.com/CrystalWlz/OMEGAS\n","authors":["Lizhi Wang","Feng Zhou","Bo yu","Pu Cao","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2404.15891v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15428v1","updated":"2024-08-27T22:05:44Z","published":"2024-08-27T22:05:44Z","title":"HEAD: A Bandwidth-Efficient Cooperative Perception Approach for\n Heterogeneous Connected and Autonomous Vehicles","summary":" In cooperative perception studies, there is often a trade-off between\ncommunication bandwidth and perception performance. While current feature\nfusion solutions are known for their excellent object detection performance,\ntransmitting the entire sets of intermediate feature maps requires substantial\nbandwidth. Furthermore, these fusion approaches are typically limited to\nvehicles that use identical detection models. Our goal is to develop a solution\nthat supports cooperative perception across vehicles equipped with different\nmodalities of sensors. 
This method aims to deliver improved perception\nperformance compared to late fusion techniques, while achieving precision\nsimilar to the state-of-art intermediate fusion, but requires an order of\nmagnitude less bandwidth. We propose HEAD, a method that fuses features from\nthe classification and regression heads in 3D object detection networks. Our\nmethod is compatible with heterogeneous detection networks such as LiDAR\nPointPillars, SECOND, VoxelNet, and camera Bird's-eye View (BEV) Encoder. Given\nthe naturally smaller feature size in the detection heads, we design a\nself-attention mechanism to fuse the classification head and a complementary\nfeature fusion layer to fuse the regression head. Our experiments,\ncomprehensively evaluated on the V2V4Real and OPV2V datasets, demonstrate that\nHEAD is a fusion method that effectively balances communication bandwidth and\nperception performance.\n","authors":["Deyuan Qu","Qi Chen","Yongqi Zhu","Yihao Zhu","Sergei S. Avedisov","Song Fu","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2408.15428v1.pdf","comment":"Accepted by ECCV 2024 Workshop"},{"id":"http://arxiv.org/abs/2307.11986v2","updated":"2024-08-27T21:25:39Z","published":"2023-07-22T05:34:18Z","title":"Expert Knowledge-Aware Image Difference Graph Representation Learning\n for Difference-Aware Medical Visual Question Answering","summary":" To contribute to automating the medical vision-language model, we propose a\nnovel Chest-Xray Difference Visual Question Answering (VQA) task. Given a pair\nof main and reference images, this task attempts to answer several questions on\nboth diseases and, more importantly, the differences between them. This is\nconsistent with the radiologist's diagnosis practice that compares the current\nimage with the reference before concluding the report. We collect a new\ndataset, namely MIMIC-Diff-VQA, including 700,703 QA pairs from 164,324 pairs\nof main and reference images. Compared to existing medical VQA datasets, our\nquestions are tailored to the Assessment-Diagnosis-Intervention-Evaluation\ntreatment procedure used by clinical professionals. Meanwhile, we also propose\na novel expert knowledge-aware graph representation learning model to address\nthis task. The proposed baseline model leverages expert knowledge such as\nanatomical structure prior, semantic, and spatial knowledge to construct a\nmulti-relationship graph, representing the image differences between two images\nfor the image difference VQA task. The dataset and code can be found at\nhttps://github.com/Holipori/MIMIC-Diff-VQA. We believe this work would further\npush forward the medical vision language model.\n","authors":["Xinyue Hu","Lin Gu","Qiyuan An","Mengliang Zhang","Liangchen Liu","Kazuma Kobayashi","Tatsuya Harada","Ronald M. Summers","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.11986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01949v2","updated":"2024-08-27T21:05:09Z","published":"2023-09-05T04:55:10Z","title":"Variational Bayesian Imaging with an Efficient Surrogate Score-based\n Prior","summary":" We propose a surrogate function for efficient yet principled use of\nscore-based priors in Bayesian imaging. We consider ill-posed inverse imaging\nproblems in which one aims for a clean image posterior given incomplete or\nnoisy measurements. Since the measurements do not uniquely determine a true\nimage, a prior is needed to constrain the solution space. 
Recent work turned\nscore-based diffusion models into principled priors for solving ill-posed\nimaging problems by appealing to an ODE-based log-probability function.\nHowever, evaluating the ODE is computationally inefficient and inhibits\nposterior estimation of high-dimensional images. Our proposed surrogate prior\nis based on the evidence lower bound of a score-based diffusion model. We\ndemonstrate the surrogate prior on variational inference for efficient\napproximate posterior sampling of large images. Compared to the exact prior in\nprevious work, our surrogate accelerates optimization of the variational image\ndistribution by at least two orders of magnitude. We also find that our\nprincipled approach gives more accurate posterior estimation than\nnon-variational diffusion-based approaches that involve hyperparameter-tuning\nat inference. Our work establishes a practical path forward for using\nscore-based diffusion models as general-purpose image priors.\n","authors":["Berthy T. Feng","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2309.01949v2.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR) August\n 2024"},{"id":"http://arxiv.org/abs/2302.11552v5","updated":"2024-08-27T20:56:53Z","published":"2023-02-22T18:48:46Z","title":"Reduce, Reuse, Recycle: Compositional Generation with Energy-Based\n Diffusion Models and MCMC","summary":" Since their introduction, diffusion models have quickly become the prevailing\napproach to generative modeling in many domains. They can be interpreted as\nlearning the gradients of a time-varying sequence of log-probability density\nfunctions. This interpretation has motivated classifier-based and\nclassifier-free guidance as methods for post-hoc control of diffusion models.\nIn this work, we build upon these ideas using the score-based interpretation of\ndiffusion models, and explore alternative ways to condition, modify, and reuse\ndiffusion models for tasks involving compositional generation and guidance. In\nparticular, we investigate why certain types of composition fail using current\ntechniques and present a number of solutions. We conclude that the sampler (not\nthe model) is responsible for this failure and propose new samplers, inspired\nby MCMC, which enable successful compositional generation. Further, we propose\nan energy-based parameterization of diffusion models which enables the use of\nnew compositional operators and more sophisticated, Metropolis-corrected\nsamplers. Intriguingly we find these samplers lead to notable improvements in\ncompositional generation across a wide set of problems such as\nclassifier-guided ImageNet modeling and compositional text-to-image generation.\n","authors":["Yilun Du","Conor Durkan","Robin Strudel","Joshua B. Tenenbaum","Sander Dieleman","Rob Fergus","Jascha Sohl-Dickstein","Arnaud Doucet","Will Grathwohl"],"pdf_url":"https://arxiv.org/pdf/2302.11552v5.pdf","comment":"ICML 2023, Project Webpage:\n https://energy-based-model.github.io/reduce-reuse-recycle/"},{"id":"http://arxiv.org/abs/2408.15398v1","updated":"2024-08-27T20:49:11Z","published":"2024-08-27T20:49:11Z","title":"Evaluating Pre-Training Bias on Severe Acute Respiratory Syndrome\n Dataset","summary":" Machine learning (ML) is a growing field of computer science that has found\nmany practical applications in several domains, including Health. 
However, as\ndata grows in size and availability, and the number of models that aim to aid\nor replace human decisions, it raises the concern that these models can be\nsusceptible to bias, which can lead to harm to specific individuals by basing\nits decisions on protected attributes such as gender, religion, sexual\norientation, ethnicity, and others. Visualization techniques might generate\ninsights and help summarize large datasets, enabling data scientists to\nunderstand the data better before training a model by evaluating pre-training\nmetrics applied to the datasets before training, which might contribute to\nidentifying potential harm before any effort is put into training and deploying\nthe models. This work uses the severe acute respiratory syndrome dataset from\nOpenDataSUS to visualize three pre-training bias metrics and their distribution\nacross different regions in Brazil. A random forest model is trained in each\nregion and applied to the others. The aim is to compare the bias for the\ndifferent regions, focusing on their protected attributes and comparing the\nmodel's performance with the metric values.\n","authors":["Diego Dimer Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2408.15398v1.pdf","comment":"short paper for eurovis, 5 pages"},{"id":"http://arxiv.org/abs/2408.15388v1","updated":"2024-08-27T20:14:42Z","published":"2024-08-27T20:14:42Z","title":"Panoptic Perception for Autonomous Driving: A Survey","summary":" Panoptic perception represents a forefront advancement in autonomous driving\ntechnology, unifying multiple perception tasks into a singular, cohesive\nframework to facilitate a thorough understanding of the vehicle's surroundings.\nThis survey reviews typical panoptic perception models for their unique inputs\nand architectures and compares them to performance, responsiveness, and\nresource utilization. It also delves into the prevailing challenges faced in\npanoptic perception and explores potential trajectories for future research.\nOur goal is to furnish researchers in autonomous driving with a detailed\nsynopsis of panoptic perception, positioning this survey as a pivotal reference\nin the ever-evolving landscape of autonomous driving technologies.\n","authors":["Yunge Li","Lanyu Xu"],"pdf_url":"https://arxiv.org/pdf/2408.15388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15386v1","updated":"2024-08-27T20:08:33Z","published":"2024-08-27T20:08:33Z","title":"Multi-Feature Aggregation in Diffusion Models for Enhanced Face\n Super-Resolution","summary":" Super-resolution algorithms often struggle with images from surveillance\nenvironments due to adverse conditions such as unknown degradation, variations\nin pose, irregular illumination, and occlusions. However, acquiring multiple\nimages, even of low quality, is possible with surveillance cameras. In this\nwork, we develop an algorithm based on diffusion models that utilize a\nlow-resolution image combined with features extracted from multiple low-quality\nimages to generate a super-resolved image while minimizing distortions in the\nindividual's identity. Unlike other algorithms, our approach recovers facial\nfeatures without explicitly providing attribute information or without the need\nto calculate a gradient of a function during the reconstruction process. To the\nbest of our knowledge, this is the first time multi-features combined with\nlow-resolution images are used as conditioners to generate more reliable\nsuper-resolution images using stochastic differential equations. 
The FFHQ\ndataset was employed for training, resulting in state-of-the-art performance in\nfacial recognition and verification metrics when evaluated on the CelebA and\nQuis-Campi datasets. Our code is publicly available at\nhttps://github.com/marcelowds/fasr\n","authors":["Marcelo dos Santos","Rayson Laroca","Rafael O. Ribeiro","João C. Neves","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2408.15386v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2024"},{"id":"http://arxiv.org/abs/2408.01959v2","updated":"2024-08-27T19:57:45Z","published":"2024-08-04T08:26:58Z","title":"Dataset Scale and Societal Consistency Mediate Facial Impression Bias in\n Vision-Language AI","summary":" Multimodal AI models capable of associating images and text hold promise for\nnumerous domains, ranging from automated image captioning to accessibility\napplications for blind and low-vision users. However, uncertainty about bias\nhas in some cases limited their adoption and availability. In the present work,\nwe study 43 CLIP vision-language models to determine whether they learn\nhuman-like facial impression biases, and we find evidence that such biases are\nreflected across three distinct CLIP model families. We show for the first time\nthat the the degree to which a bias is shared across a society predicts the\ndegree to which it is reflected in a CLIP model. Human-like impressions of\nvisually unobservable attributes, like trustworthiness and sexuality, emerge\nonly in models trained on the largest dataset, indicating that a better fit to\nuncurated cultural data results in the reproduction of increasingly subtle\nsocial biases. Moreover, we use a hierarchical clustering approach to show that\ndataset size predicts the extent to which the underlying structure of facial\nimpression bias resembles that of facial impression bias in humans. Finally, we\nshow that Stable Diffusion models employing CLIP as a text encoder learn facial\nimpression biases, and that these biases intersect with racial biases in Stable\nDiffusion XL-Turbo. While pretrained CLIP models may prove useful for\nscientific studies of bias, they will also require significant dataset curation\nwhen intended for use as general-purpose models in a zero-shot setting.\n","authors":["Robert Wolfe","Aayushi Dangol","Alexis Hiniker","Bill Howe"],"pdf_url":"https://arxiv.org/pdf/2408.01959v2.pdf","comment":"Accepted at Artificial Intelligence, Ethics, and Society 2024"},{"id":"http://arxiv.org/abs/2312.06731v6","updated":"2024-08-27T19:51:13Z","published":"2023-12-11T09:44:41Z","title":"Genixer: Empowering Multimodal Large Language Models as a Powerful Data\n Generator","summary":" Multimodal Large Language Models (MLLMs) demonstrate exceptional\nproblem-solving capabilities, but few research studies aim to gauge the ability\nto generate visual instruction tuning data. This paper proposes to explore the\npotential of empowering MLLMs to generate data independently without relying on\nGPT-4. We introduce Genixer, a comprehensive data generation pipeline\nconsisting of four key steps: (i) instruction data collection, (ii) instruction\ntemplate design, (iii) empowering MLLMs, and (iv) data generation and\nfiltering. Additionally, we outline two modes of data generation: task-agnostic\nand task-specific, enabling controllable output. We demonstrate that a\nsynthetic VQA-like dataset trained with LLaVA1.5 enhances performance on 10 out\nof 12 multimodal benchmarks. 
Additionally, the grounding MLLM Shikra, when\ntrained with a REC-like synthetic dataset, shows improvements on 7 out of 8 REC\ndatasets. Through experiments and synthetic data analysis, our findings are:\n(1) current MLLMs can serve as robust data generators without assistance from\nGPT-4V; (2) MLLMs trained with task-specific datasets can surpass GPT-4V in\ngenerating complex instruction tuning data; (3) synthetic datasets enhance\nperformance across various multimodal benchmarks and help mitigate model\nhallucinations. The data, code, and models can be found at\nhttps://github.com/zhaohengyuan1/Genixer.\n","authors":["Henry Hengyuan Zhao","Pan Zhou","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.06731v6.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2408.15374v1","updated":"2024-08-27T19:22:06Z","published":"2024-08-27T19:22:06Z","title":"CycleGAN with Better Cycles","summary":" CycleGAN provides a framework to train image-to-image translation with\nunpaired datasets using cycle consistency loss [4]. While results are great in\nmany applications, the pixel level cycle consistency can potentially be\nproblematic and causes unrealistic images in certain cases. In this project, we\npropose three simple modifications to cycle consistency, and show that such an\napproach achieves better results with fewer artifacts.\n","authors":["Tongzhou Wang","Yihan Lin"],"pdf_url":"https://arxiv.org/pdf/2408.15374v1.pdf","comment":"Technical Report 2018"},{"id":"http://arxiv.org/abs/2408.15373v1","updated":"2024-08-27T19:13:15Z","published":"2024-08-27T19:13:15Z","title":"Handling Geometric Domain Shifts in Semantic Segmentation of Surgical\n RGB and Hyperspectral Images","summary":" Robust semantic segmentation of intraoperative image data holds promise for\nenabling automatic surgical scene understanding and autonomous robotic surgery.\nWhile model development and validation are primarily conducted on idealistic\nscenes, geometric domain shifts, such as occlusions of the situs, are common in\nreal-world open surgeries. To close this gap, we (1) present the first analysis\nof state-of-the-art (SOA) semantic segmentation models when faced with\ngeometric out-of-distribution (OOD) data, and (2) propose an augmentation\ntechnique called \"Organ Transplantation\", to enhance generalizability. Our\ncomprehensive validation on six different OOD datasets, comprising 600 RGB and\nhyperspectral imaging (HSI) cubes from 33 pigs, each annotated with 19 classes,\nreveals a large performance drop in SOA organ segmentation models on geometric\nOOD data. This performance decline is observed not only in conventional RGB\ndata (with a dice similarity coefficient (DSC) drop of 46 %) but also in HSI\ndata (with a DSC drop of 45 %), despite the richer spectral information\ncontent. The performance decline increases with the spatial granularity of the\ninput data. Our augmentation technique improves SOA model performance by up to\n67 % for RGB data and 90 % for HSI data, achieving performance at the level of\nin-distribution performance on real OOD test data. Given the simplicity and\neffectiveness of our augmentation method, it is a valuable tool for addressing\ngeometric domain shifts in surgical scene segmentation, regardless of the\nunderlying model. Our code and pre-trained models are publicly available at\nhttps://github.com/IMSY-DKFZ/htc.\n","authors":["Silvia Seidlitz","Jan Sellner","Alexander Studier-Fischer","Alessandro Motta","Berkin Özdemir","Beat P. 
Müller-Stich","Felix Nickel","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2408.15373v1.pdf","comment":"Silvia Seidlitz and Jan Sellner contributed equally"},{"id":"http://arxiv.org/abs/2408.13912v2","updated":"2024-08-27T19:06:57Z","published":"2024-08-25T18:27:20Z","title":"Splatt3R: Zero-shot Gaussian Splatting from Uncalibrated Image Pairs","summary":" In this paper, we introduce Splatt3R, a pose-free, feed-forward method for\nin-the-wild 3D reconstruction and novel view synthesis from stereo pairs. Given\nuncalibrated natural images, Splatt3R can predict 3D Gaussian Splats without\nrequiring any camera parameters or depth information. For generalizability, we\nbuild Splatt3R upon a ``foundation'' 3D geometry reconstruction method, MASt3R,\nby extending it to deal with both 3D structure and appearance. Specifically,\nunlike the original MASt3R which reconstructs only 3D point clouds, we predict\nthe additional Gaussian attributes required to construct a Gaussian primitive\nfor each point. Hence, unlike other novel view synthesis methods, Splatt3R is\nfirst trained by optimizing the 3D point cloud's geometry loss, and then a\nnovel view synthesis objective. By doing this, we avoid the local minima\npresent in training 3D Gaussian Splats from stereo views. We also propose a\nnovel loss masking strategy that we empirically find is critical for strong\nperformance on extrapolated viewpoints. We train Splatt3R on the ScanNet++\ndataset and demonstrate excellent generalisation to uncalibrated, in-the-wild\nimages. Splatt3R can reconstruct scenes at 4FPS at 512 x 512 resolution, and\nthe resultant splats can be rendered in real-time.\n","authors":["Brandon Smart","Chuanxia Zheng","Iro Laina","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2408.13912v2.pdf","comment":"Our project page can be found at: https://splatt3r.active.vision/"},{"id":"http://arxiv.org/abs/2301.06267v5","updated":"2024-08-27T19:00:47Z","published":"2023-01-16T05:40:42Z","title":"Multimodality Helps Unimodality: Cross-Modal Few-Shot Learning with\n Multimodal Models","summary":" The ability to quickly learn a new task with minimal instruction - known as\nfew-shot learning - is a central aspect of intelligent agents. Classical\nfew-shot benchmarks make use of few-shot samples from a single modality, but\nsuch samples may not be sufficient to characterize an entire concept class. In\ncontrast, humans use cross-modal information to learn new concepts efficiently.\nIn this work, we demonstrate that one can indeed build a better ${\\bf visual}$\ndog classifier by ${\\bf read}$ing about dogs and ${\\bf listen}$ing to them\nbark. To do so, we exploit the fact that recent multimodal foundation models\nsuch as CLIP learn cross-modal encoders that map different modalities to the\nsame representation space. Specifically, we propose a simple strategy for ${\\bf\ncross-modal}$ ${\\bf adaptation}$: we treat examples from different modalities\nas additional few-shot examples. For example, by simply repurposing class names\nas an additional training sample, we trivially turn any n-shot learning problem\ninto a (n+1)-shot problem. This allows us to produce SOTA results with\nembarrassingly simple linear classifiers. We show that our approach can be\ncombined with existing methods such as prefix tuning, adapters, and classifier\nensembling. 
Finally, to explore other modalities beyond vision and language, we\nconstruct the first (to our knowledge) audiovisual few-shot benchmark and use\ncross-modal training to improve the performance of both image and audio\nclassification.\n","authors":["Zhiqiu Lin","Samuel Yu","Zhiyi Kuang","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2301.06267v5.pdf","comment":"Published at CVPR 2023. Project site:\n https://linzhiqiu.github.io/papers/cross_modal/"},{"id":"http://arxiv.org/abs/2406.14568v2","updated":"2024-08-27T18:42:09Z","published":"2024-04-29T23:53:42Z","title":"Policy Gradient-Driven Noise Mask","summary":" Deep learning classifiers face significant challenges when dealing with\nheterogeneous multi-modal and multi-organ biomedical datasets. The low-level\nfeature distinguishability limited to imaging-modality hinders the classifiers'\nability to learn high-level semantic relationships, resulting in sub-optimal\nperformance. To address this issue, image augmentation strategies are employed\nas regularization techniques. While additive noise input during network\ntraining is a well-established augmentation as regularization method, modern\npipelines often favor more robust techniques such as dropout and weight decay.\nThis preference stems from the observation that combining these established\ntechniques with noise input can adversely affect model performance.\n In this study, we propose a novel pretraining pipeline that learns to\ngenerate conditional noise mask specifically tailored to improve performance on\nmulti-modal and multi-organ datasets. As a reinforcement learning algorithm,\nour approach employs a dual-component system comprising a very light-weight\npolicy network that learns to sample conditional noise using a differentiable\nbeta distribution as well as a classifier network. The policy network is\ntrained using the reinforce algorithm to generate image-specific noise masks\nthat regularize the classifier during pretraining. A key aspect is that the\npolicy network's role is limited to obtaining an intermediate (or heated) model\nbefore fine-tuning. During inference, the policy network is omitted, allowing\ndirect comparison between the baseline and noise-regularized models.\n We conducted experiments and related analyses on RadImageNet datasets.\nResults demonstrate that fine-tuning the intermediate models consistently\noutperforms conventional training algorithms on both classification and\ngeneralization to unseen concept tasks.\n","authors":["Mehmet Can Yavuz","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2406.14568v2.pdf","comment":"13 pages; 8 figures; 5 tables"},{"id":"http://arxiv.org/abs/2403.10170v2","updated":"2024-08-27T18:36:12Z","published":"2024-03-15T10:26:52Z","title":"Computer User Interface Understanding. A New Dataset and a Learning\n Framework","summary":" User Interface (UI) understanding has been an increasingly popular topic over\nthe last few years. So far, there has been a vast focus solely on web and\nmobile applications. In this paper, we introduce the harder task of computer UI\nunderstanding. With the goal of enabling research in this field, we have\ngenerated a dataset with a set of videos where a user is performing a sequence\nof actions and each image shows the desktop contents at that time point. We\nalso present a framework that is composed of a synthetic sample generation\npipeline to augment the dataset with relevant characteristics, and a\ncontrastive learning method to classify images in the videos. 
We take advantage\nof the natural conditional, tree-like, relationship of the images'\ncharacteristics to regularize the learning of the representations by dealing\nwith multiple partial tasks simultaneously. Experimental results show that the\nproposed framework outperforms previously proposed hierarchical multi-label\ncontrastive losses in fine-grain UI classification.\n","authors":["Andrés Muñoz","Daniel Borrajo"],"pdf_url":"https://arxiv.org/pdf/2403.10170v2.pdf","comment":"14 pages main paper, 6 pages appendix"},{"id":"http://arxiv.org/abs/2408.15355v1","updated":"2024-08-27T18:27:47Z","published":"2024-08-27T18:27:47Z","title":"Optimizing Lung Cancer Detection in CT Imaging: A Wavelet Multi-Layer\n Perceptron (WMLP) Approach Enhanced by Dragonfly Algorithm (DA)","summary":" Lung cancer stands as the preeminent cause of cancer-related mortality\nglobally. Prompt and precise diagnosis, coupled with effective treatment, is\nimperative to reduce the fatality rates associated with this formidable\ndisease. This study introduces a cutting-edge deep learning framework for the\nclassification of lung cancer from CT scan imagery. The research encompasses a\nsuite of image pre-processing strategies, notably Canny edge detection, and\nwavelet transformations, which precede the extraction of salient features and\nsubsequent classification via a Multi-Layer Perceptron (MLP). The optimization\nprocess is further refined using the Dragonfly Algorithm (DA). The methodology\nput forth has attained an impressive training and testing accuracy of 99.82\\%,\nunderscoring its efficacy and reliability in the accurate diagnosis of lung\ncancer.\n","authors":["Bitasadat Jamshidi","Nastaran Ghorbani","Mohsen Rostamy-Malkhalifeh"],"pdf_url":"https://arxiv.org/pdf/2408.15355v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.15232v1","updated":"2024-08-27T17:50:03Z","published":"2024-08-27T17:50:03Z","title":"Into the Unknown Unknowns: Engaged Human Learning through Participation\n in Language Model Agent Conversations","summary":" While language model (LM)-powered chatbots and generative search engines\nexcel at answering concrete queries, discovering information in the terrain of\nunknown unknowns remains challenging for users. To emulate the common\neducational scenario where children/students learn by listening to and\nparticipating in conversations of their parents/teachers, we create\nCollaborative STORM (Co-STORM). Unlike QA systems that require users to ask all\nthe questions, Co-STORM lets users observe and occasionally steer the discourse\namong several LM agents. The agents ask questions on the user's behalf,\nallowing the user to discover unknown unknowns serendipitously. To facilitate\nuser interaction, Co-STORM assists users in tracking the discourse by\norganizing the uncovered information into a dynamic mind map, ultimately\ngenerating a comprehensive report as takeaways. For automatic evaluation, we\nconstruct the WildSeek dataset by collecting real information-seeking records\nwith user goals. Co-STORM outperforms baseline methods on both discourse trace\nand report quality. In a further human evaluation, 70% of participants prefer\nCo-STORM over a search engine, and 78% favor it over a RAG chatbot.\n","authors":["Yucheng Jiang","Yijia Shao","Dekun Ma","Sina J. Semnani","Monica S. 
Lam"],"pdf_url":"https://arxiv.org/pdf/2408.15232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15172v1","updated":"2024-08-27T16:10:21Z","published":"2024-08-27T16:10:21Z","title":"X-Reflect: Cross-Reflection Prompting for Multimodal Recommendation","summary":" Large Language Models (LLMs) and Large Multimodal Models (LMMs) have been\nshown to enhance the effectiveness of enriching item descriptions, thereby\nimproving the accuracy of recommendation systems. However, most existing\napproaches either rely on text-only prompting or employ basic multimodal\nstrategies that do not fully exploit the complementary information available\nfrom both textual and visual modalities. This paper introduces a novel\nframework, Cross-Reflection Prompting, termed X-Reflect, designed to address\nthese limitations by prompting LMMs to explicitly identify and reconcile\nsupportive and conflicting information between text and images. By capturing\nnuanced insights from both modalities, this approach generates more\ncomprehensive and contextually richer item representations. Extensive\nexperiments conducted on two widely used benchmarks demonstrate that our method\noutperforms existing prompting baselines in downstream recommendation accuracy.\nAdditionally, we evaluate the generalizability of our framework across\ndifferent LMM backbones and the robustness of the prompting strategies,\noffering insights for optimization. This work underscores the importance of\nintegrating multimodal information and presents a novel solution for improving\nitem understanding in multimodal recommendation systems.\n","authors":["Hanjia Lyu","Ryan Rossi","Xiang Chen","Md Mehrab Tanjim","Stefano Petrangeli","Somdeb Sarkhel","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16828v2","updated":"2024-08-27T15:07:28Z","published":"2024-07-23T20:38:23Z","title":"Pareto Front Approximation for Multi-Objective Session-Based Recommender\n Systems","summary":" This work introduces MultiTRON, an approach that adapts Pareto front\napproximation techniques to multi-objective session-based recommender systems\nusing a transformer neural network. Our approach optimizes trade-offs between\nkey metrics such as click-through and conversion rates by training on sampled\npreference vectors. A significant advantage is that after training, a single\nmodel can access the entire Pareto front, allowing it to be tailored to meet\nthe specific requirements of different stakeholders by adjusting an additional\ninput vector that weights the objectives. We validate the model's performance\nthrough extensive offline and online evaluation. For broader application and\nresearch, the source code is made available at\nhttps://github.com/otto-de/MultiTRON. 
The results confirm the model's ability\nto manage multiple recommendation objectives effectively, offering a flexible\ntool for diverse business needs.\n","authors":["Timo Wilm","Philipp Normann","Felix Stepprath"],"pdf_url":"https://arxiv.org/pdf/2407.16828v2.pdf","comment":"Accepted at the Eighteenth ACM Conference on Recommender Systems\n (RecSys '24)"},{"id":"http://arxiv.org/abs/2402.09766v2","updated":"2024-08-27T13:01:56Z","published":"2024-02-15T07:35:52Z","title":"From Variability to Stability: Advancing RecSys Benchmarking Practices","summary":" In the rapidly evolving domain of Recommender Systems (RecSys), new\nalgorithms frequently claim state-of-the-art performance based on evaluations\nover a limited set of arbitrarily selected datasets. However, this approach may\nfail to holistically reflect their effectiveness due to the significant impact\nof dataset characteristics on algorithm performance. Addressing this\ndeficiency, this paper introduces a novel benchmarking methodology to\nfacilitate a fair and robust comparison of RecSys algorithms, thereby advancing\nevaluation practices. By utilizing a diverse set of $30$ open datasets,\nincluding two introduced in this work, and evaluating $11$ collaborative\nfiltering algorithms across $9$ metrics, we critically examine the influence of\ndataset characteristics on algorithm performance. We further investigate the\nfeasibility of aggregating outcomes from multiple datasets into a unified\nranking. Through rigorous experimental analysis, we validate the reliability of\nour methodology under the variability of datasets, offering a benchmarking\nstrategy that balances quality and computational demands. This methodology\nenables a fair yet effective means of evaluating RecSys algorithms, providing\nvaluable guidance for future research endeavors.\n","authors":["Valeriy Shevchenko","Nikita Belousov","Alexey Vasilev","Vladimir Zholobov","Artyom Sosedka","Natalia Semenova","Anna Volodkevich","Andrey Savchenko","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2402.09766v2.pdf","comment":"8 pages with 11 figures"},{"id":"http://arxiv.org/abs/2408.15004v1","updated":"2024-08-27T12:41:37Z","published":"2024-08-27T12:41:37Z","title":"Measuring publication relatedness using controlled vocabularies","summary":" Measuring the relatedness between scientific publications has important\napplications in many areas of bibliometrics and science policy. Controlled\nvocabularies provide a promising basis for measuring relatedness because they\naddress issues that arise when using citation or textual similarity to measure\nrelatedness. While several controlled-vocabulary-based relatedness measures\nhave been developed, there exists no comprehensive and direct test of their\naccuracy and suitability for different types of research questions. This paper\nreviews existing measures, develops a new measure, and benchmarks the measures\nusing TREC Genomics data as a ground truth of topics. The benchmark test show\nthat the new measure and the measure proposed by Ahlgren et al. (2020) have\ndiffering strengths and weaknesses. 
These results inform a discussion of which\nmethod to choose when studying interdisciplinarity, information retrieval,\nclustering of science, and researcher topic switching.\n","authors":["Emil Dolmer Alnor"],"pdf_url":"https://arxiv.org/pdf/2408.15004v1.pdf","comment":"Accepted for presentation at the 28th International Conference on\n Science, Technology and Innovation Indicators, 2024"},{"id":"http://arxiv.org/abs/2408.15002v1","updated":"2024-08-27T12:34:41Z","published":"2024-08-27T12:34:41Z","title":"Knowledge Discovery in Optical Music Recognition: Enhancing Information\n Retrieval with Instance Segmentation","summary":" Optical Music Recognition (OMR) automates the transcription of musical\nnotation from images into machine-readable formats like MusicXML, MEI, or MIDI,\nsignificantly reducing the costs and time of manual transcription. This study\nexplores knowledge discovery in OMR by applying instance segmentation using\nMask R-CNN to enhance the detection and delineation of musical symbols in sheet\nmusic. Unlike Optical Character Recognition (OCR), OMR must handle the\nintricate semantics of Common Western Music Notation (CWMN), where symbol\nmeanings depend on shape, position, and context. Our approach leverages\ninstance segmentation to manage the density and overlap of musical symbols,\nfacilitating more precise information retrieval from music scores. Evaluations\non the DoReMi and MUSCIMA++ datasets demonstrate substantial improvements, with\nour method achieving a mean Average Precision (mAP) of up to 59.70\\% in dense\nsymbol environments, achieving comparable results to object detection.\nFurthermore, using traditional computer vision techniques, we add a parallel\nstep for staff detection to infer the pitch for the recognised symbols. This\nstudy emphasises the role of pixel-wise segmentation in advancing accurate\nmusic symbol recognition, contributing to knowledge discovery in OMR. Our\nfindings indicate that instance segmentation provides more precise\nrepresentations of musical symbols, particularly in densely populated scores,\nadvancing OMR technology. We make our implementation, pre-processing scripts,\ntrained models, and evaluation results publicly available to support further\nresearch and development.\n","authors":["Elona Shatri","George Fazekas"],"pdf_url":"https://arxiv.org/pdf/2408.15002v1.pdf","comment":"8 pages content and one references, accepted version at the\n International Conference on Knowledge Discovery and Information Retrieval\n 2024, Porto, Portugal"},{"id":"http://arxiv.org/abs/2408.14968v1","updated":"2024-08-27T11:21:19Z","published":"2024-08-27T11:21:19Z","title":"MRSE: An Efficient Multi-modality Retrieval System for Large Scale\n E-commerce","summary":" Providing high-quality item recall for text queries is crucial in large-scale\ne-commerce search systems. Current Embedding-based Retrieval Systems (ERS)\nembed queries and items into a shared low-dimensional space, but uni-modality\nERS rely too heavily on textual features, making them unreliable in complex\ncontexts. While multi-modality ERS incorporate various data sources, they often\noverlook individual preferences for different modalities, leading to suboptimal\nresults. To address these issues, we propose MRSE, a Multi-modality Retrieval\nSystem that integrates text, item images, and user preferences through\nlightweight mixture-of-expert (LMoE) modules to better align features across\nand within modalities. 
MRSE also builds user profiles at a multi-modality level\nand introduces a novel hybrid loss function that enhances consistency and\nrobustness using hard negative sampling. Experiments on a large-scale dataset\nfrom Shopee and online A/B testing show that MRSE achieves an 18.9% improvement\nin offline relevance and a 3.7% gain in online core metrics compared to\nShopee's state-of-the-art uni-modality system.\n","authors":["Hao Jiang","Haoxiang Zhang","Qingshan Hou","Chaofeng Chen","Weisi Lin","Jingchang Zhang","Annan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14908v1","updated":"2024-08-27T09:35:13Z","published":"2024-08-27T09:35:13Z","title":"Triplètoile: Extraction of Knowledge from Microblogging Text","summary":" Numerous methods and pipelines have recently emerged for the automatic\nextraction of knowledge graphs from documents such as scientific publications\nand patents. However, adapting these methods to incorporate alternative text\nsources like micro-blogging posts and news has proven challenging as they\nstruggle to model open-domain entities and relations, typically found in these\nsources. In this paper, we propose an enhanced information extraction pipeline\ntailored to the extraction of a knowledge graph comprising open-domain entities\nfrom micro-blogging posts on social media platforms. Our pipeline leverages\ndependency parsing and classifies entity relations in an unsupervised manner\nthrough hierarchical clustering over word embeddings. We provide a use case on\nextracting semantic triples from a corpus of 100 thousand tweets about digital\ntransformation and publicly release the generated knowledge graph. On the same\ndataset, we conduct two experimental evaluations, showing that the system\nproduces triples with precision over 95% and outperforms similar pipelines of\naround 5% in terms of precision, while generating a comparatively higher number\nof triples.\n","authors":["Vanni Zavarella","Sergio Consoli","Diego Reforgiato Recupero","Gianni Fenu","Simone Angioni","Davide Buscaldi","Danilo Dessì","Francesco Osborne"],"pdf_url":"https://arxiv.org/pdf/2408.14908v1.pdf","comment":"42 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.14906v1","updated":"2024-08-27T09:34:38Z","published":"2024-08-27T09:34:38Z","title":"Writing in the Margins: Better Inference Pattern for Long Context\n Retrieval","summary":" In this paper, we introduce Writing in the Margins (WiM), a new inference\npattern for Large Language Models designed to optimize the handling of long\ninput sequences in retrieval-oriented tasks. This approach leverages the\nchunked prefill of the key-value cache to perform segment-wise inference, which\nenables efficient processing of extensive contexts along with the generation\nand classification of intermediate information (\"margins\") that guide the model\ntowards specific tasks. This method increases computational overhead marginally\nwhile significantly enhancing the performance of off-the-shelf models without\nthe need for fine-tuning. 
Specifically, we observe that WiM provides an average\nenhancement of 7.5% in accuracy for reasoning skills (HotpotQA, MultiHop-RAG)\nand more than a 30.0% increase in the F1-score for aggregation tasks (CWE).\nAdditionally, we show how the proposed pattern fits into an interactive\nretrieval design that provides end-users with ongoing updates about the\nprogress of context processing, and pinpoints the integration of relevant\ninformation into the final response. We release our implementation of WiM using\nHugging Face Transformers library at\nhttps://github.com/writer/writing-in-the-margins.\n","authors":["Melisa Russak","Umar Jamil","Christopher Bryant","Kiran Kamble","Axel Magnuson","Mateusz Russak","Waseem AlShikh"],"pdf_url":"https://arxiv.org/pdf/2408.14906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14851v1","updated":"2024-08-27T08:08:05Z","published":"2024-08-27T08:08:05Z","title":"Graph and Sequential Neural Networks in Session-based Recommendation: A\n Survey","summary":" Recent years have witnessed the remarkable success of recommendation systems\n(RSs) in alleviating the information overload problem. As a new paradigm of\nRSs, session-based recommendation (SR) specializes in users' short-term\npreference capture and aims to provide a more dynamic and timely recommendation\nbased on the ongoing interacted actions. In this survey, we will give a\ncomprehensive overview of the recent works on SR. First, we clarify the\ndefinitions of various SR tasks and introduce the characteristics of\nsession-based recommendation against other recommendation tasks. Then, we\nsummarize the existing methods in two categories: sequential neural network\nbased methods and graph neural network (GNN) based methods. The standard\nframeworks and technical are also introduced. Finally, we discuss the\nchallenges of SR and new research directions in this area.\n","authors":["Zihao Li","Chao Yang","Yakun Chen","Xianzhi Wang","Hongxu Chen","Guandong Xu","Lina Yao","Quan Z. Sheng"],"pdf_url":"https://arxiv.org/pdf/2408.14851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14043v2","updated":"2024-08-27T06:18:05Z","published":"2024-06-20T07:06:58Z","title":"Taxonomy-Guided Zero-Shot Recommendations with LLMs","summary":" With the emergence of large language models (LLMs) and their ability to\nperform a variety of tasks, their application in recommender systems (RecSys)\nhas shown promise. However, we are facing significant challenges when deploying\nLLMs into RecSys, such as limited prompt length, unstructured item information,\nand un-constrained generation of recommendations, leading to sub-optimal\nperformance. To address these issues, we propose a novel method using a\ntaxonomy dictionary. This method provides a systematic framework for\ncategorizing and organizing items, improving the clarity and structure of item\ninformation. By incorporating the taxonomy dictionary into LLM prompts, we\nachieve efficient token utilization and controlled feature generation, leading\nto more accurate and contextually relevant recommendations. Our Taxonomy-guided\nRecommendation (TaxRec) approach features a two-step process: one-time taxonomy\ncategorization and LLM-based recommendation, enabling zero-shot recommendations\nwithout the need for domain-specific fine-tuning. Experimental results\ndemonstrate TaxRec significantly enhances recommendation quality compared to\ntraditional zero-shot approaches, showcasing its efficacy as personal\nrecommender with LLMs. 
Code is available at\nhttps://github.com/yueqingliang1/TaxRec.\n","authors":["Yueqing Liang","Liangwei Yang","Chen Wang","Xiongxiao Xu","Philip S. Yu","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2406.14043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01262v3","updated":"2024-08-27T03:13:50Z","published":"2024-08-02T13:35:11Z","title":"RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework","summary":" Retrieval-Augmented Generation (RAG) systems have demonstrated their\nadvantages in alleviating the hallucination of Large Language Models (LLMs).\nExisting RAG benchmarks mainly focus on evaluating whether LLMs can correctly\nanswer the general knowledge. However, they are unable to evaluate the\neffectiveness of the RAG system in dealing with the data from different\nvertical domains. This paper introduces RAGEval, a framework for automatically\ngenerating evaluation datasets to evaluate the knowledge usage ability of\ndifferent LLMs in different scenarios. Specifically, RAGEval summarizes a\nschema from seed documents, applies the configurations to generate diverse\ndocuments, and constructs question-answering pairs according to both articles\nand configurations. We propose three novel metrics, Completeness,\nHallucination, and Irrelevance, to carefully evaluate the responses generated\nby LLMs. By benchmarking RAG models in vertical domains, RAGEval has the\nability to better evaluate the knowledge usage ability of LLMs, which avoids\nthe confusion regarding the source of knowledge in answering question in\nexisting QA datasets--whether it comes from parameterized memory or retrieval.\nThe code and dataset will be released.\n","authors":["Kunlun Zhu","Yifan Luo","Dingling Xu","Ruobing Wang","Shi Yu","Shuo Wang","Yukun Yan","Zhenghao Liu","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2408.01262v3.pdf","comment":"add github repo"},{"id":"http://arxiv.org/abs/2408.14743v1","updated":"2024-08-27T02:43:40Z","published":"2024-08-27T02:43:40Z","title":"Personalized Video Summarization using Text-Based Queries and\n Conditional Modeling","summary":" The proliferation of video content on platforms like YouTube and Vimeo\npresents significant challenges in efficiently locating relevant information.\nAutomatic video summarization aims to address this by extracting and presenting\nkey content in a condensed form. This thesis explores enhancing video\nsummarization by integrating text-based queries and conditional modeling to\ntailor summaries to user needs. Traditional methods often produce fixed\nsummaries that may not align with individual requirements. To overcome this, we\npropose a multi-modal deep learning approach that incorporates both textual\nqueries and visual information, fusing them at different levels of the model\narchitecture. Evaluation metrics such as accuracy and F1-score assess the\nquality of the generated summaries. The thesis also investigates improving\ntext-based query representations using contextualized word embeddings and\nspecialized attention networks. This enhances the semantic understanding of\nqueries, leading to better video summaries. To emulate human-like\nsummarization, which accounts for both visual coherence and abstract factors\nlike storyline consistency, we introduce a conditional modeling approach. This\nmethod uses multiple random variables and joint distributions to capture key\nsummarization components, resulting in more human-like and explainable\nsummaries. 
Addressing data scarcity in fully supervised learning, the thesis\nproposes a segment-level pseudo-labeling approach. This self-supervised method\ngenerates additional data, improving model performance even with limited\nhuman-labeled datasets. In summary, this research aims to enhance automatic\nvideo summarization by incorporating text-based queries, improving query\nrepresentations, introducing conditional modeling, and addressing data\nscarcity, thereby creating more effective and personalized video summaries.\n","authors":["Jia-Hong Huang"],"pdf_url":"https://arxiv.org/pdf/2408.14743v1.pdf","comment":"Ph.D. thesis, 137 pages"},{"id":"http://arxiv.org/abs/2408.14723v1","updated":"2024-08-27T01:23:49Z","published":"2024-08-27T01:23:49Z","title":"Snap and Diagnose: An Advanced Multimodal Retrieval System for\n Identifying Plant Diseases in the Wild","summary":" Plant disease recognition is a critical task that ensures crop health and\nmitigates the damage caused by diseases. A handy tool that enables farmers to\nreceive a diagnosis based on query pictures or the text description of\nsuspicious plants is in high demand for initiating treatment before potential\ndiseases spread further. In this paper, we develop a multimodal plant disease\nimage retrieval system to support disease search based on either image or text\nprompts. Specifically, we utilize the largest in-the-wild plant disease dataset\nPlantWild, which includes over 18,000 images across 89 categories, to provide a\ncomprehensive view of potential diseases relating to the query. Furthermore,\ncross-modal retrieval is achieved in the developed system, facilitated by a\nnovel CLIP-based vision-language model that encodes both disease descriptions\nand disease images into the same latent space. Built on top of the retriever,\nour retrieval system allows users to upload either plant disease images or\ndisease descriptions to retrieve the corresponding images with similar\ncharacteristics from the disease dataset to suggest candidate diseases for end\nusers' consideration.\n","authors":["Tianqi Wei","Zhi Chen","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2408.14723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15371v1","updated":"2024-08-27T19:10:21Z","published":"2024-08-27T19:10:21Z","title":"Temporal Graph Neural Network-Powered Paper Recommendation on Dynamic\n Citation Networks","summary":" Due to the rapid growth of scientific publications, identifying all related\nreference articles in the literature has become increasingly challenging yet\nhighly demanding. Existing methods primarily assess candidate publications from\na static perspective, focusing on the content of articles and their structural\ninformation, such as citation relationships. There is a lack of research\nregarding how to account for the evolving impact among papers on their\nembeddings. Toward this goal, this paper introduces a temporal dimension to\npaper recommendation strategies. The core idea is to continuously update a\npaper's embedding when new citation relationships appear, enhancing its\nrelevance for future recommendations. Whenever a citation relationship is added\nto the literature upon the publication of a paper, the embeddings of the two\nrelated papers are updated through a Temporal Graph Neural Network (TGN). A\nlearnable memory update module based on a Recurrent Neural Network (RNN) is\nutilized to study the evolution of the embedding of a paper in order to predict\nits reference impact in a future timestamp. 
Such a TGN-based model learns a\npattern of how people's views of the paper may evolve, aiming to guide paper\nrecommendations more precisely. Extensive experiments on an open citation\nnetwork dataset, including 313,278 articles from\nhttps://paperswithcode.com/about PaperWithCode, have demonstrated the\neffectiveness of the proposed approach.\n","authors":["Junhao Shen","Mohammad Ausaf Ali Haqqani","Beichen Hu","Cheng Huang","Xihao Xie","Tsengdar Lee","Jia Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15371v1.pdf","comment":"10 pages, 4 figures, accepted by SDU@AAAI-2024. The AAAI Workshop on\n Scientific Document Understanding (2024)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.15240v1","updated":"2024-08-27T17:57:45Z","published":"2024-08-27T17:57:45Z","title":"Generative Verifiers: Reward Modeling as Next-Token Prediction","summary":" Verifiers or reward models are often used to enhance the reasoning\nperformance of large language models (LLMs). A common approach is the Best-of-N\nmethod, where N candidate solutions generated by the LLM are ranked by a\nverifier, and the best one is selected. While LLM-based verifiers are typically\ntrained as discriminative classifiers to score solutions, they do not utilize\nthe text generation capabilities of pretrained LLMs. To overcome this\nlimitation, we instead propose training verifiers using the ubiquitous\nnext-token prediction objective, jointly on verification and solution\ngeneration. Compared to standard verifiers, such generative verifiers (GenRM)\ncan benefit from several advantages of LLMs: they integrate seamlessly with\ninstruction tuning, enable chain-of-thought reasoning, and can utilize\nadditional inference-time compute via majority voting for better verification.\nWe demonstrate that when using Gemma-based verifiers on algorithmic and\ngrade-school math reasoning tasks, GenRM outperforms discriminative verifiers\nand LLM-as-a-Judge, showing a 16-64% improvement in the percentage of problems\nsolved with Best-of-N. Furthermore, we show that GenRM scales favorably across\ndataset size, model capacity, and inference-time compute.\n","authors":["Lunjun Zhang","Arian Hosseini","Hritik Bansal","Mehran Kazemi","Aviral Kumar","Rishabh Agarwal"],"pdf_url":"https://arxiv.org/pdf/2408.15240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15237v1","updated":"2024-08-27T17:56:11Z","published":"2024-08-27T17:56:11Z","title":"The Mamba in the Llama: Distilling and Accelerating Hybrid Models","summary":" Linear RNN architectures, like Mamba, can be competitive with Transformer\nmodels in language modeling while having advantageous deployment\ncharacteristics. Given the focus on training large-scale Transformer models, we\nconsider the challenge of converting these pretrained models for deployment. We\ndemonstrate that it is feasible to distill large Transformers into linear RNNs\nby reusing the linear projection weights from attention layers with academic\nGPU resources. The resulting hybrid model, which incorporates a quarter of the\nattention layers, achieves performance comparable to the original Transformer\nin chat benchmarks and outperforms open-source hybrid Mamba models trained from\nscratch with trillions of tokens in both chat benchmarks and general\nbenchmarks. 
Moreover, we introduce a hardware-aware speculative decoding\nalgorithm that accelerates the inference speed of Mamba and hybrid models.\nOverall we show how, with limited computation resources, we can remove many of\nthe original attention layers and generate from the resulting model more\nefficiently. Our top-performing model, distilled from Llama3-8B-Instruct,\nachieves a 29.61 length-controlled win rate on AlpacaEval 2 against GPT-4 and\n7.35 on MT-Bench, surpassing the best instruction-tuned linear RNN model.\n","authors":["Junxiong Wang","Daniele Paliotta","Avner May","Alexander M. Rush","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2408.15237v1.pdf","comment":"Code is open-sourced at https://github.com/jxiw/MambaInLlama"},{"id":"http://arxiv.org/abs/2408.15231v1","updated":"2024-08-27T17:48:29Z","published":"2024-08-27T17:48:29Z","title":"DCT-CryptoNets: Scaling Private Inference in the Frequency Domain","summary":" The convergence of fully homomorphic encryption (FHE) and machine learning\noffers unprecedented opportunities for private inference of sensitive data. FHE\nenables computation directly on encrypted data, safeguarding the entire machine\nlearning pipeline, including data and model confidentiality. However, existing\nFHE-based implementations for deep neural networks face significant challenges\nin computational cost, latency, and scalability, limiting their practical\ndeployment. This paper introduces DCT-CryptoNets, a novel approach that\nleverages frequency-domain learning to tackle these issues. Our method operates\ndirectly in the frequency domain, utilizing the discrete cosine transform (DCT)\ncommonly employed in JPEG compression. This approach is inherently compatible\nwith remote computing services, where images are usually transmitted and stored\nin compressed formats. DCT-CryptoNets reduces the computational burden of\nhomomorphic operations by focusing on perceptually relevant low-frequency\ncomponents. This is demonstrated by substantial latency reduction of up to\n5.3$\\times$ compared to prior work on image classification tasks, including a\nnovel demonstration of ImageNet inference within 2.5 hours, down from 12.5\nhours compared to prior work on equivalent compute resources. Moreover,\nDCT-CryptoNets improves the reliability of encrypted accuracy by reducing\nvariability (e.g., from $\\pm$2.5\\% to $\\pm$1.0\\% on ImageNet). This study\ndemonstrates a promising avenue for achieving efficient and practical\nprivacy-preserving deep learning on high resolution images seen in real-world\napplications.\n","authors":["Arjun Roy","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2408.15231v1.pdf","comment":"Under Review; 10 pages content, 3 pages appendix, 4 figures, 8\n tables; Code TBD"},{"id":"http://arxiv.org/abs/2408.15221v1","updated":"2024-08-27T17:33:30Z","published":"2024-08-27T17:33:30Z","title":"LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet","summary":" Recent large language model (LLM) defenses have greatly improved models'\nability to refuse harmful queries, even when adversarially attacked. However,\nLLM defenses are primarily evaluated against automated adversarial attacks in a\nsingle turn of conversation, an insufficient threat model for real-world\nmalicious use. We demonstrate that multi-turn human jailbreaks uncover\nsignificant vulnerabilities, exceeding 70% attack success rate (ASR) on\nHarmBench against defenses that report single-digit ASRs with automated\nsingle-turn attacks. 
Human jailbreaks also reveal vulnerabilities in machine\nunlearning defenses, successfully recovering dual-use biosecurity knowledge\nfrom unlearned models. We compile these results into Multi-Turn Human\nJailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks.\nWe publicly release MHJ alongside a compendium of jailbreak tactics developed\nacross dozens of commercial red teaming engagements, supporting research\ntowards stronger LLM defenses.\n","authors":["Nathaniel Li","Ziwen Han","Ian Steneker","Willow Primack","Riley Goodside","Hugh Zhang","Zifan Wang","Cristina Menghini","Summer Yue"],"pdf_url":"https://arxiv.org/pdf/2408.15221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.13643v2","updated":"2024-08-27T17:32:39Z","published":"2022-09-27T19:16:26Z","title":"MPC-Pipe: an Efficient Pipeline Scheme for Secure Multi-party Machine\n Learning Inference","summary":" Multi-party computing (MPC) has been gaining popularity as a secure computing\nmodel over the past few years. However, prior works have demonstrated that MPC\nprotocols still pay substantial performance penalties compared to plaintext,\nparticularly when applied to ML algorithms. The overhead is due to added\ncomputation and communication costs. Prior studies, as well as our own\nanalysis, found that most MPC protocols today sequentially perform\ncommunication and computation. The participating parties must compute on their\nshares first and then perform data communication to allow the distribution of\nnew secret shares before proceeding to the next computation step. In this work,\nwe show that serialization is unnecessary, particularly in the context of ML\ncomputations (both in Convolutional neural networks and in Transformer-based\nmodels). We demonstrate that it is possible to carefully orchestrate the\ncomputation and communication steps to overlap.\n We propose MPC-Pipe, an efficient MPC system for both training and inference\nof ML workloads, which pipelines computations and communications in an MPC\nprotocol during the online phase. MPC-Pipe proposes three pipeline schemes to\noptimize the online phase of ML in the semi-honest majority adversary setting.\nWe implement MPC-Pipe by augmenting a modified version of CrypTen, which\nseparates online and offline phases. We evaluate the end-to-end system\nperformance benefits of the online phase of MPC using deep neural networks\n(VGG16, ResNet50) and Transformers using different network settings. We show\nthat MPC-Pipe can improve the throughput and latency of ML workloads.\n","authors":["Yongqin Wang","Rachit Rajat","Murali Annavaram"],"pdf_url":"https://arxiv.org/pdf/2209.13643v2.pdf","comment":"To be appeared in ASPLOS'25"},{"id":"http://arxiv.org/abs/2209.04042v3","updated":"2024-08-27T17:24:51Z","published":"2022-09-08T21:46:12Z","title":"Assessing Lower Limb Strength using Internet-of-Things Enabled Chair","summary":" This project describes the application of the technologies of Machine\nLearning and Internet-of-Things to assess the lower limb strength of\nindividuals undergoing rehabilitation or therapy. Specifically, it seeks to\nmeasure and assess the progress of individuals by sensors attached to chairs\nand processing the data through Google GPU Tensorflow CoLab. Pressure sensors\nare attached to various locations on a chair, including but not limited to the\nseating area, backrest, hand rests, and legs. 
Sensor data from the individual\nperforming both sit-to-stand transition and stand-to-sit transition provides a\ntime series dataset regarding the pressure distribution and vibratory motion on\nthe chair. The dataset and timing information can then be fed into a machine\nlearning model to estimate the relative strength and weakness during various\nphases of the movement.\n","authors":["Chelsea Yeh","Hanna Kaitlin Dy","Phillip Schodinger","Hudson Kaleb Dy"],"pdf_url":"https://arxiv.org/pdf/2209.04042v3.pdf","comment":"12 Pages"},{"id":"http://arxiv.org/abs/2406.14507v2","updated":"2024-08-27T17:19:20Z","published":"2024-06-20T17:12:20Z","title":"On Newton's Method to Unlearn Neural Networks","summary":" With the widespread applications of neural networks (NNs) trained on personal\ndata, machine unlearning has become increasingly important for enabling\nindividuals to exercise their personal data ownership, particularly the \"right\nto be forgotten\" from trained NNs. Since retraining is computationally\nexpensive, we seek approximate unlearning algorithms for NNs that return\nidentical models to the retrained oracle. While Newton's method has been\nsuccessfully used to approximately unlearn linear models, we observe that\nadapting it for NN is challenging due to degenerate Hessians that make\ncomputing Newton's update impossible. Additionally, we show that when coupled\nwith popular techniques to resolve the degeneracy, Newton's method often incurs\noffensively large norm updates and empirically degrades model performance\npost-unlearning. To address these challenges, we propose CureNewton's method, a\nprinciple approach that leverages cubic regularization to handle the Hessian\ndegeneracy effectively. The added regularizer eliminates the need for manual\nfinetuning and affords a natural interpretation within the unlearning context.\nExperiments across different models and datasets show that our method can\nachieve competitive unlearning performance to the state-of-the-art algorithm in\npractical unlearning settings, while being theoretically justified and\nefficient in running time.\n","authors":["Nhung Bui","Xinyang Lu","Rachael Hwee Ling Sim","See-Kiong Ng","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2406.14507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05921v2","updated":"2024-08-27T17:14:16Z","published":"2024-07-08T13:28:47Z","title":"TAPVid-3D: A Benchmark for Tracking Any Point in 3D","summary":" We introduce a new benchmark, TAPVid-3D, for evaluating the task of\nlong-range Tracking Any Point in 3D (TAP-3D). While point tracking in two\ndimensions (TAP) has many benchmarks measuring performance on real-world\nvideos, such as TAPVid-DAVIS, three-dimensional point tracking has none. To\nthis end, leveraging existing footage, we build a new benchmark for 3D point\ntracking featuring 4,000+ real-world videos, composed of three different data\nsources spanning a variety of object types, motion patterns, and indoor and\noutdoor environments. To measure performance on the TAP-3D task, we formulate a\ncollection of metrics that extend the Jaccard-based metric used in TAP to\nhandle the complexities of ambiguous depth scales across models, occlusions,\nand multi-track spatio-temporal smoothness. We manually verify a large sample\nof trajectories to ensure correct video annotations, and assess the current\nstate of the TAP-3D task by constructing competitive baselines using existing\ntracking models. 
We anticipate this benchmark will serve as a guidepost to\nimprove our ability to understand precise 3D motion and surface deformation\nfrom monocular video. Code for dataset download, generation, and model\nevaluation is available at https://tapvid3d.github.io\n","authors":["Skanda Koppula","Ignacio Rocco","Yi Yang","Joe Heyward","João Carreira","Andrew Zisserman","Gabriel Brostow","Carl Doersch"],"pdf_url":"https://arxiv.org/pdf/2407.05921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14053v5","updated":"2024-08-27T17:03:12Z","published":"2023-09-25T11:35:10Z","title":"Revisiting LARS for Large Batch Training Generalization of Neural\n Networks","summary":" This paper explores Large Batch Training techniques using layer-wise adaptive\nscaling ratio (LARS) across diverse settings, uncovering insights. LARS\nalgorithms with warm-up tend to be trapped in sharp minimizers early on due to\nredundant ratio scaling. Additionally, a fixed steep decline in the latter\nphase restricts deep neural networks from effectively navigating early-phase\nsharp minimizers. Building on these findings, we propose Time Varying LARS\n(TVLARS), a novel algorithm that replaces warm-up with a configurable\nsigmoid-like function for robust training in the initial phase. TVLARS promotes\ngradient exploration early on, surpassing sharp optimizers and gradually\ntransitioning to LARS for robustness in later phases. Extensive experiments\ndemonstrate that TVLARS consistently outperforms LARS and LAMB in most cases,\nwith up to 2\\% improvement in classification scenarios. Notably, in all\nself-supervised learning cases, TVLARS dominates LARS and LAMB with performance\nimprovements of up to 10\\%.\n","authors":["Khoi Do","Duong Nguyen","Hoa Nguyen","Long Tran-Thanh","Nguyen-Hoang Tran","Quoc-Viet Pham"],"pdf_url":"https://arxiv.org/pdf/2309.14053v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15198v1","updated":"2024-08-27T16:58:23Z","published":"2024-08-27T16:58:23Z","title":"Automatic 8-tissue Segmentation for 6-month Infant Brains","summary":" Numerous studies have highlighted that atypical brain development,\nparticularly during infancy and toddlerhood, is linked to an increased\nlikelihood of being diagnosed with a neurodevelopmental condition, such as\nautism. Accurate brain tissue segmentations for morphological analysis are\nessential in numerous infant studies. However, due to ongoing white matter (WM)\nmyelination changing tissue contrast in T1- and T2-weighted images, automatic\ntissue segmentation in 6-month infants is particularly difficult. On the other\nhand, manual labelling by experts is time-consuming and labor-intensive. In\nthis study, we propose the first 8-tissue segmentation pipeline for\nsix-month-old infant brains. This pipeline utilizes domain adaptation (DA)\ntechniques to leverage our longitudinal data, including neonatal images\nsegmented with the neonatal Developing Human Connectome Project structural\npipeline. 
Our pipeline takes raw 6-month images as inputs and generates the\n8-tissue segmentation as outputs, forming an end-to-end segmentation pipeline.\nThe segmented tissues include WM, gray matter (GM), cerebrospinal fluid (CSF),\nventricles, cerebellum, basal ganglia, brainstem, and hippocampus/amygdala.\nCycle-Consistent Generative Adversarial Network (CycleGAN) and Attention U-Net\nwere employed to achieve the image contrast transformation between neonatal and\n6-month images and perform tissue segmentation on the synthesized 6-month\nimages (neonatal images with 6-month intensity contrast), respectively.\nMoreover, we incorporated the segmentation outputs from Infant Brain Extraction\nand Analysis Toolbox (iBEAT) and another Attention U-Net to further enhance the\nperformance and construct the end-to-end segmentation pipeline. Our evaluation\nwith real 6-month images achieved a DICE score of 0.92, an HD95 of 1.6, and an\nASSD of 0.42.\n","authors":["Yilan Dong","Vanessa Kyriakopoulou","Irina Grigorescu","Grainne McAlonan","Dafnis Batalle","Maria Deprez"],"pdf_url":"https://arxiv.org/pdf/2408.15198v1.pdf","comment":"11 pages, 4 figures, to be published in MICCAI PIPPI workshop"},{"id":"http://arxiv.org/abs/2408.14461v2","updated":"2024-08-27T16:43:52Z","published":"2024-08-26T17:50:47Z","title":"A domain decomposition-based autoregressive deep learning model for\n unsteady and nonlinear partial differential equations","summary":" In this paper, we propose a domain-decomposition-based deep learning (DL)\nframework, named transient-CoMLSim, for accurately modeling unsteady and\nnonlinear partial differential equations (PDEs). The framework consists of two\nkey components: (a) a convolutional neural network (CNN)-based autoencoder\narchitecture and (b) an autoregressive model composed of fully connected\nlayers. Unlike existing state-of-the-art methods that operate on the entire\ncomputational domain, our CNN-based autoencoder computes a lower-dimensional\nbasis for solution and condition fields represented on subdomains. Timestepping\nis performed entirely in the latent space, generating embeddings of the\nsolution variables from the time history of embeddings of solution and\ncondition variables. This approach not only reduces computational complexity\nbut also enhances scalability, making it well-suited for large-scale\nsimulations. Furthermore, to improve the stability of our rollouts, we employ a\ncurriculum learning (CL) approach during the training of the autoregressive\nmodel. The domain-decomposition strategy enables scaling to out-of-distribution\ndomain sizes while maintaining the accuracy of predictions -- a feature not\neasily integrated into popular DL-based approaches for physics simulations. We\nbenchmark our model against two widely-used DL architectures, Fourier Neural\nOperator (FNO) and U-Net, and demonstrate that our framework outperforms them\nin terms of accuracy, extrapolation to unseen timesteps, and stability for a\nwide range of use cases.\n","authors":["Sheel Nidhan","Haoliang Jiang","Lalit Ghule","Clancy Umphrey","Rishikesh Ranade","Jay Pathak"],"pdf_url":"https://arxiv.org/pdf/2408.14461v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2401.13054v3","updated":"2024-08-27T16:42:26Z","published":"2024-01-23T19:26:24Z","title":"Frustrated Random Walks: A Fast Method to Compute Node Distances on\n Hypergraphs","summary":" A hypergraph is a generalization of a graph that arises naturally when\nattribute-sharing among entities is considered. 
Compared to graphs, hypergraphs\nhave the distinct advantage that they contain explicit communities and are more\nconvenient to manipulate. An open problem in hypergraph research is how to\naccurately and efficiently calculate node distances on hypergraphs. Estimating\nnode distances enables us to find a node's nearest neighbors, which has\nimportant applications in such areas as recommender system, targeted\nadvertising, etc. In this paper, we propose using expected hitting times of\nrandom walks to compute hypergraph node distances. We note that simple random\nwalks (SRW) cannot accurately compute node distances on highly complex\nreal-world hypergraphs, which motivates us to introduce frustrated random walks\n(FRW) for this task. We further benchmark our method against DeepWalk, and show\nthat while the latter can achieve comparable results, FRW has a distinct\ncomputational advantage in cases where the number of targets is fairly small.\nFor such cases, we show that FRW runs in significantly shorter time than\nDeepWalk. Finally, we analyze the time complexity of our method, and show that\nfor large and sparse hypergraphs, the complexity is approximately linear,\nrendering it superior to the DeepWalk alternative.\n","authors":["Enzhi Li","Scott Nickleach","Bilal Fadlallah"],"pdf_url":"https://arxiv.org/pdf/2401.13054v3.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.15183v1","updated":"2024-08-27T16:35:06Z","published":"2024-08-27T16:35:06Z","title":"On latent dynamics learning in nonlinear reduced order modeling","summary":" In this work, we present the novel mathematical framework of latent dynamics\nmodels (LDMs) for reduced order modeling of parameterized nonlinear\ntime-dependent PDEs. Our framework casts this latter task as a nonlinear\ndimensionality reduction problem, while constraining the latent state to evolve\naccordingly to an (unknown) dynamical system. A time-continuous setting is\nemployed to derive error and stability estimates for the LDM approximation of\nthe full order model (FOM) solution. We analyze the impact of using an explicit\nRunge-Kutta scheme in the time-discrete setting, resulting in the\n$\\Delta\\text{LDM}$ formulation, and further explore the learnable setting,\n$\\Delta\\text{LDM}_\\theta$, where deep neural networks approximate the discrete\nLDM components, while providing a bounded approximation error with respect to\nthe FOM. Moreover, we extend the concept of parameterized Neural ODE - recently\nproposed as a possible way to build data-driven dynamical systems with varying\ninput parameters - to be a convolutional architecture, where the input\nparameters information is injected by means of an affine modulation mechanism,\nwhile designing a convolutional autoencoder neural network able to retain\nspatial-coherence, thus enhancing interpretability at the latent level.\nNumerical experiments, including the Burgers' and the\nadvection-reaction-diffusion equations, demonstrate the framework's ability to\nobtain, in a multi-query context, a time-continuous approximation of the FOM\nsolution, thus being able to query the LDM approximation at any given time\ninstance while retaining a prescribed level of accuracy. 
Our findings highlight\nthe remarkable potential of the proposed LDMs, representing a mathematically\nrigorous framework to enhance the accuracy and approximation capabilities of\nreduced order modeling for time-dependent parameterized PDEs.\n","authors":["Nicola Farenga","Stefania Fresca","Simone Brivio","Andrea Manzoni"],"pdf_url":"https://arxiv.org/pdf/2408.15183v1.pdf","comment":"43 pages"},{"id":"http://arxiv.org/abs/2408.15173v1","updated":"2024-08-27T16:11:20Z","published":"2024-08-27T16:11:20Z","title":"Exploiting Approximate Symmetry for Efficient Multi-Agent Reinforcement\n Learning","summary":" Mean-field games (MFG) have become significant tools for solving large-scale\nmulti-agent reinforcement learning problems under symmetry. However, the\nassumption of exact symmetry limits the applicability of MFGs, as real-world\nscenarios often feature inherent heterogeneity. Furthermore, most works on MFG\nassume access to a known MFG model, which might not be readily available for\nreal-world finite-agent games. In this work, we broaden the applicability of\nMFGs by providing a methodology to extend any finite-player, possibly\nasymmetric, game to an \"induced MFG\". First, we prove that $N$-player dynamic\ngames can be symmetrized and smoothly extended to the infinite-player continuum\nvia explicit Kirszbraun extensions. Next, we propose the notion of\n$\\alpha,\\beta$-symmetric games, a new class of dynamic population games that\nincorporate approximate permutation invariance. For $\\alpha,\\beta$-symmetric\ngames, we establish explicit approximation bounds, demonstrating that a Nash\npolicy of the induced MFG is an approximate Nash of the $N$-player dynamic\ngame. We show that TD learning converges up to a small bias using trajectories\nof the $N$-player game with finite-sample guarantees, permitting symmetrized\nlearning without building an explicit MFG model. Finally, for certain games\nsatisfying monotonicity, we prove a sample complexity of\n$\\widetilde{\\mathcal{O}}(\\varepsilon^{-6})$ for the $N$-agent game to learn an\n$\\varepsilon$-Nash up to symmetrization bias. Our theory is supported by\nevaluations on MARL benchmarks with thousands of agents.\n","authors":["Batuhan Yardim","Niao He"],"pdf_url":"https://arxiv.org/pdf/2408.15173v1.pdf","comment":"5 figures"},{"id":"http://arxiv.org/abs/2408.15165v1","updated":"2024-08-27T16:03:18Z","published":"2024-08-27T16:03:18Z","title":"Latent Ewald summation for machine learning of long-range interactions","summary":" Machine learning interatomic potentials (MLIPs) often neglect long-range\ninteractions, such as electrostatic and dispersion forces. In this work, we\nintroduce a straightforward and efficient method to account for long-range\ninteractions by learning a latent variable from local atomic descriptors and\napplying an Ewald summation to this variable. We demonstrate that in systems\nincluding charged, polar, or apolar molecular dimers, bulk water, and\nwater-vapor interface, standard short-ranged MLIPs can lead to unphysical\npredictions even when employing message passing. 
The long-range models\neffectively eliminate these artifacts, with only about twice the computational\ncost of short-range MLIPs.\n","authors":["Bingqing Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.15165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15158v1","updated":"2024-08-27T15:52:52Z","published":"2024-08-27T15:52:52Z","title":"Delay as Payoff in MAB","summary":" In this paper, we investigate a variant of the classical stochastic\nMulti-armed Bandit (MAB) problem, where the payoff received by an agent (either\ncost or reward) is both delayed, and directly corresponds to the magnitude of\nthe delay. This setting models faithfully many real world scenarios such as the\ntime it takes for a data packet to traverse a network given a choice of route\n(where delay serves as the agent's cost); or a user's time spent on a web page\ngiven a choice of content (where delay serves as the agent's reward).\n Our main contributions are tight upper and lower bounds for both the cost and\nreward settings. For the case that delays serve as costs, which we are the\nfirst to consider, we prove optimal regret that scales as $\\sum_{i:\\Delta_i >\n0}\\frac{\\log T}{\\Delta_i} + d^*$, where $T$ is the maximal number of steps,\n$\\Delta_i$ are the sub-optimality gaps and $d^*$ is the minimal expected delay\namongst arms. For the case that delays serves as rewards, we show optimal\nregret of $\\sum_{i:\\Delta_i > 0}\\frac{\\log T}{\\Delta_i} + \\bar{d}$, where $\\bar\nd$ is the second maximal expected delay. These improve over the regret in the\ngeneral delay-dependent payoff setting, which scales as $\\sum_{i:\\Delta_i >\n0}\\frac{\\log T}{\\Delta_i} + D$, where $D$ is the maximum possible delay. Our\nregret bounds highlight the difference between the cost and reward scenarios,\nshowing that the improvement in the cost scenario is more significant than for\nthe reward. Finally, we accompany our theoretical results with an empirical\nevaluation.\n","authors":["Ofir Schlisselberg","Ido Cohen","Tal Lancewicki","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2408.15158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11075v4","updated":"2024-08-27T15:39:53Z","published":"2024-07-13T04:29:36Z","title":"A Comprehensive Survey on Kolmogorov Arnold Networks (KAN)","summary":" Through this comprehensive survey of Kolmogorov-Arnold Networks(KAN), we have\ngained a thorough understanding of its theoretical foundation, architectural\ndesign, application scenarios, and current research progress. KAN, with its\nunique architecture and flexible activation functions, excels in handling\ncomplex data patterns and nonlinear relationships, demonstrating wide-ranging\napplication potential. While challenges remain, KAN is poised to pave the way\nfor innovative solutions in various fields, potentially revolutionizing how we\napproach complex computational problems.\n","authors":["Yuntian Hou","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.11075v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08669v2","updated":"2024-08-27T15:36:59Z","published":"2024-01-08T21:13:07Z","title":"Deep Reinforcement Learning for Multi-Truck Vehicle Routing Problems\n with Multi-Leg Demand Routes","summary":" Deep reinforcement learning (RL) has been shown to be effective in producing\napproximate solutions to some vehicle routing problems (VRPs), especially when\nusing policies generated by encoder-decoder attention mechanisms. 
While these\ntechniques have been quite successful for relatively simple problem instances,\nthere are still under-researched and highly complex VRP variants for which no\neffective RL method has been demonstrated. In this work we focus on one such\nVRP variant, which contains multiple trucks and multi-leg routing requirements.\nIn these problems, demand is required to move along sequences of nodes, instead\nof just from a start node to an end node. With the goal of making deep RL a\nviable strategy for real-world industrial-scale supply chain logistics, we\ndevelop new extensions to existing encoder-decoder attention models which allow\nthem to handle multiple trucks and multi-leg routing requirements. Our models\nhave the advantage that they can be trained for a small number of trucks and\nnodes, and then embedded into a large supply chain to yield solutions for\nlarger numbers of trucks and nodes. We test our approach on a real supply chain\nenvironment arising in the operations of Japanese automotive parts manufacturer\nAisin Corporation, and find that our algorithm outperforms Aisin's previous\nbest solution.\n","authors":["Joshua Levin","Randall Correll","Takanori Ide","Takafumi Suzuki","Takaho Saito","Alan Arai"],"pdf_url":"https://arxiv.org/pdf/2401.08669v2.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.10280v2","updated":"2024-08-27T15:34:49Z","published":"2024-08-18T12:18:56Z","title":"NoRA: Nested Low-Rank Adaptation for Efficient Fine-Tuning Large Models","summary":" In this paper, we introduce Nested Low-Rank Adaptation (NoRA), a novel\napproach to parameter-efficient fine-tuning that extends the capabilities of\nLow-Rank Adaptation (LoRA) techniques. Vanilla LoRA overlooks pre-trained\nweight inheritance and still requires fine-tuning numerous parameters. To\naddress these issues, our NoRA adopts a dual-layer nested structure with\nSingular Value Decomposition (SVD), effectively leveraging original matrix\nknowledge while reducing tunable parameters. Specifically, NoRA freezes the\nouter LoRA weights and utilizes an inner LoRA design, providing enhanced\ncontrol over model optimization. This approach allows the model to more\nprecisely adapt to specific tasks while maintaining a compact parameter space.\nBy freezing outer LoRA weights and using an inner LoRA design, NoRA enables\nprecise task adaptation with a compact parameter space. Evaluations on tasks\nincluding commonsense reasoning with large language models, fine-tuning\nvision-language models, and subject-driven generation demonstrate NoRA's\nsuperiority over LoRA and its variants. Code will be released upon acceptance.\n","authors":["Cheng Lin","Lujun Li","Dezhi Li","Jie Zou","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2408.10280v2.pdf","comment":"Work in progress, revisions ongoing"},{"id":"http://arxiv.org/abs/2311.07596v2","updated":"2024-08-27T15:34:43Z","published":"2023-11-10T11:40:24Z","title":"Graph GOSPA metric: a metric to measure the discrepancy between graphs\n of different sizes","summary":" This paper proposes a metric to measure the dissimilarity between graphs that\nmay have a different number of nodes. The proposed metric extends the\ngeneralised optimal subpattern assignment (GOSPA) metric, which is a metric for\nsets, to graphs. The proposed graph GOSPA metric includes costs associated with\nnode attribute errors for properly assigned nodes, missed and false nodes and\nedge mismatches between graphs. 
The computation of this metric is based on\nfinding the optimal assignments between nodes in the two graphs, with the\npossibility of leaving some of the nodes unassigned. We also propose a lower\nbound for the metric, which is also a metric for graphs and is computable in\npolynomial time using linear programming. The metric is first derived for\nundirected unweighted graphs and it is then extended to directed and weighted\ngraphs. The properties of the metric are demonstrated via simulated and\nempirical datasets.\n","authors":["Jinhao Gu","Ángel F. García-Fernández","Robert E. Firth","Lennart Svensson"],"pdf_url":"https://arxiv.org/pdf/2311.07596v2.pdf","comment":"Accepted in IEEE Transactions on Signal Processing. The code is\n available at https://github.com/JinhaoGu/The-graph-GOSPA-metric"},{"id":"http://arxiv.org/abs/2405.14848v2","updated":"2024-08-27T15:28:33Z","published":"2024-05-23T17:56:38Z","title":"Local Causal Discovery for Structural Evidence of Direct Discrimination","summary":" Identifying the causal pathways of unfairness is a critical objective in\nimproving policy design and algorithmic decision-making. Prior work in causal\nfairness analysis often requires knowledge of the causal graph, hindering\npractical applications in complex or low-knowledge domains. Moreover, global\ndiscovery methods that learn causal structure from data can result in unstable\nperformance with finite samples, potentially leading to contradictory fairness\nconclusions. To mitigate these issues, we introduce local discovery for direct\ndiscrimination (LD3): a method that uncovers structural evidence of direct\ndiscrimination by identifying the causal parents of an outcome variable. LD3\nperforms a linear number of conditional independence tests relative to variable\nset size, and allows for latent confounding under the sufficient condition that\nno parent of the outcome is latent. We show that LD3 returns a valid adjustment\nset (VAS) under a new graphical criterion for the weighted controlled direct\neffect, a qualitative indicator of direct discrimination. LD3 limits\nunnecessary adjustment, providing interpretable VAS for assessing unfairness.\nWe use LD3 to analyze causal fairness in two complex decision systems: criminal\nrecidivism prediction and liver transplant allocation. LD3 was more\ntime-efficient and returned more plausible results on real-world data than\nbaselines, which took 46x to 5870x longer to execute.\n","authors":["Jacqueline Maasch","Kyra Gan","Violet Chen","Agni Orfanoudaki","Nil-Jana Akpinar","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15138v1","updated":"2024-08-27T15:23:09Z","published":"2024-08-27T15:23:09Z","title":"How transformers learn structured data: insights from hierarchical\n filtering","summary":" We introduce a hierarchical filtering procedure for generative models of\nsequences on trees, enabling control over the range of positional correlations\nin the data. Leveraging this controlled setting, we provide evidence that\nvanilla encoder-only transformer architectures can implement the optimal Belief\nPropagation algorithm on both root classification and masked language modeling\ntasks. Correlations at larger distances corresponding to increasing layers of\nthe hierarchy are sequentially included as the network is trained. We analyze\nhow the transformer layers succeed by focusing on attention maps from models\ntrained with varying degrees of filtering. 
These attention maps show clear\nevidence for iterative hierarchical reconstruction of correlations, and we can\nrelate these observations to a plausible implementation of the exact inference\nalgorithm for the network sizes considered.\n","authors":["Jerome Garnier-Brun","Marc Mézard","Emanuele Moscato","Luca Saglietti"],"pdf_url":"https://arxiv.org/pdf/2408.15138v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15136v1","updated":"2024-08-27T15:19:07Z","published":"2024-08-27T15:19:07Z","title":"Low-Budget Simulation-Based Inference with Bayesian Neural Networks","summary":" Simulation-based inference methods have been shown to be inaccurate in the\ndata-poor regime, when training simulations are limited or expensive. Under\nthese circumstances, the inference network is particularly prone to\noverfitting, and using it without accounting for the computational uncertainty\narising from the lack of identifiability of the network weights can lead to\nunreliable results. To address this issue, we propose using Bayesian neural\nnetworks in low-budget simulation-based inference, thereby explicitly\naccounting for the computational uncertainty of the posterior approximation. We\ndesign a family of Bayesian neural network priors that are tailored for\ninference and show that they lead to well-calibrated posteriors on tested\nbenchmarks, even when as few as $O(10)$ simulations are available. This opens\nup the possibility of performing reliable simulation-based inference using very\nexpensive simulators, as we demonstrate on a problem from the field of\ncosmology where single simulations are computationally expensive. We show that\nBayesian neural networks produce informative and well-calibrated posterior\nestimates with only a few hundred simulations.\n","authors":["Arnaud Delaunoy","Maxence de la Brassinne Bonardeaux","Siddharth Mishra-Sharma","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2408.15136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07531v2","updated":"2024-08-27T15:16:06Z","published":"2024-08-14T13:03:41Z","title":"Development of a Large Language Model-based Multi-Agent Clinical\n Decision Support System for Korean Triage and Acuity Scale (KTAS)-Based\n Triage and Treatment Planning in Emergency Departments","summary":" Emergency department (ED) overcrowding and the complexity of rapid\ndecision-making in critical care settings pose significant challenges to\nhealthcare systems worldwide. While clinical decision support systems (CDSS)\nhave shown promise, the integration of large language models (LLMs) offers new\npossibilities for enhancing triage accuracy and clinical decision-making. This\nstudy presents an LLM-driven CDSS designed to assist ED physicians and nurses\nin patient triage, treatment planning, and overall emergency care management.\n We developed a multi-agent CDSS utilizing Llama-3-70b as the base LLM,\norchestrated by CrewAI and Langchain. The system comprises four AI agents\nemulating key ED roles: Triage Nurse, Emergency Physician, Pharmacist, and ED\nCoordinator. It incorporates the Korean Triage and Acuity Scale (KTAS) for\ntriage assessment and integrates with the RxNorm API for medication management.\n The model was evaluated using the Asclepius dataset, with performance\nassessed by a clinical emergency medicine specialist. The CDSS demonstrated\nhigh accuracy in triage decision-making compared to the baseline of a\nsingle-agent system. 
Furthermore, the system exhibited strong performance in\ncritical areas, including primary diagnosis, critical findings identification,\ndisposition decision-making, treatment planning, and resource allocation.\n Our multi-agent CDSS demonstrates significant potential for supporting\ncomprehensive emergency care management. By leveraging state-of-the-art AI\ntechnologies, this system offers a scalable and adaptable tool that could\nenhance emergency medical care delivery, potentially alleviating ED\novercrowding and improving patient outcomes. This work contributes to the\ngrowing field of AI applications in emergency medicine and offers a promising\ndirection for future research and clinical implementation.\n","authors":["Seungjun Han","Wongyung Choi"],"pdf_url":"https://arxiv.org/pdf/2408.07531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15133v1","updated":"2024-08-27T15:13:06Z","published":"2024-08-27T15:13:06Z","title":"Using LLMs for Explaining Sets of Counterfactual Examples to Final Users","summary":" Causality is vital for understanding true cause-and-effect relationships\nbetween variables within predictive models, rather than relying on mere\ncorrelations, making it highly relevant in the field of Explainable AI. In an\nautomated decision-making scenario, causal inference methods can analyze the\nunderlying data-generation process, enabling explanations of a model's decision\nby manipulating features and creating counterfactual examples. These\ncounterfactuals explore hypothetical scenarios where a minimal number of\nfactors are altered, providing end-users with valuable information on how to\nchange their situation. However, interpreting a set of multiple counterfactuals\ncan be challenging for end-users who are not used to analyzing raw data\nrecords. In our work, we propose a novel multi-step pipeline that uses\ncounterfactuals to generate natural language explanations of actions that will\nlead to a change in outcome in classifiers of tabular data using LLMs. This\npipeline is designed to guide the LLM through smaller tasks that mimic human\nreasoning when explaining a decision based on counterfactual cases. We\nconducted various experiments using a public dataset and proposed a method of\nclosed-loop evaluation to assess the coherence of the final explanation with\nthe counterfactuals, as well as the quality of the content. Results are\npromising, although further experiments with other datasets and human\nevaluations should be carried out.\n","authors":["Arturo Fredes","Jordi Vitria"],"pdf_url":"https://arxiv.org/pdf/2408.15133v1.pdf","comment":"Presented as a poster in the 2nd Workshop on Causal Inference and\n Machine Learning in Practice at KDD 2024"},{"id":"http://arxiv.org/abs/2408.15128v1","updated":"2024-08-27T15:08:06Z","published":"2024-08-27T15:08:06Z","title":"Evaluating the Energy Consumption of Machine Learning: Systematic\n Literature Review and Experiments","summary":" Monitoring, understanding, and optimizing the energy consumption of Machine\nLearning (ML) are various reasons why it is necessary to evaluate the energy\nusage of ML. However, there exists no universal tool that can answer this\nquestion for all use cases, and there may even be disagreement on how to\nevaluate energy consumption for a specific use case. Tools and methods are\nbased on different approaches, each with their own advantages and drawbacks,\nand they need to be mapped out and explained in order to select the most\nsuitable one for a given situation. 
We address this challenge through two\napproaches. First, we conduct a systematic literature review of all tools and\nmethods that permit to evaluate the energy consumption of ML (both at training\nand at inference), irrespective of whether they were originally designed for\nmachine learning or general software. Second, we develop and use an\nexperimental protocol to compare a selection of these tools and methods. The\ncomparison is both qualitative and quantitative on a range of ML tasks of\ndifferent nature (vision, language) and computational complexity. The\nsystematic literature review serves as a comprehensive guide for understanding\nthe array of tools and methods used in evaluating energy consumption of ML, for\nvarious use cases going from basic energy monitoring to consumption\noptimization. Two open-source repositories are provided for further\nexploration. The first one contains tools that can be used to replicate this\nwork or extend the current review. The second repository houses the\nexperimental protocol, allowing users to augment the protocol with new ML\ncomputing tasks and additional energy evaluation tools.\n","authors":["Charlotte Rodriguez","Laura Degioanni","Laetitia Kameni","Richard Vidal","Giovanni Neglia"],"pdf_url":"https://arxiv.org/pdf/2408.15128v1.pdf","comment":"52 pages,"},{"id":"http://arxiv.org/abs/2407.16828v2","updated":"2024-08-27T15:07:28Z","published":"2024-07-23T20:38:23Z","title":"Pareto Front Approximation for Multi-Objective Session-Based Recommender\n Systems","summary":" This work introduces MultiTRON, an approach that adapts Pareto front\napproximation techniques to multi-objective session-based recommender systems\nusing a transformer neural network. Our approach optimizes trade-offs between\nkey metrics such as click-through and conversion rates by training on sampled\npreference vectors. A significant advantage is that after training, a single\nmodel can access the entire Pareto front, allowing it to be tailored to meet\nthe specific requirements of different stakeholders by adjusting an additional\ninput vector that weights the objectives. We validate the model's performance\nthrough extensive offline and online evaluation. For broader application and\nresearch, the source code is made available at\nhttps://github.com/otto-de/MultiTRON. The results confirm the model's ability\nto manage multiple recommendation objectives effectively, offering a flexible\ntool for diverse business needs.\n","authors":["Timo Wilm","Philipp Normann","Felix Stepprath"],"pdf_url":"https://arxiv.org/pdf/2407.16828v2.pdf","comment":"Accepted at the Eighteenth ACM Conference on Recommender Systems\n (RecSys '24)"},{"id":"http://arxiv.org/abs/2408.15126v1","updated":"2024-08-27T15:07:27Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in\nfields of materials science, chemistry, pharmacology just to name a few.\nConventional MD simulations are plagued by numerical stability as well as long\nequilibration time issues, which limits broader applications of MD simulations.\nRecently, a surge of deep learning approaches have been devised for\ntime-coarsened dynamics, which learns the state transition mechanism over much\nlarger time scales to overcome these limitations. 
However, only a few methods\ntarget the underlying Boltzmann distribution by resampling techniques, where\nproposals are rarely accepted as new states with low efficiency. In this work,\nwe propose a force-guided bridge matching model, FBM, a novel framework that\nfirst incorporates physical priors into bridge matching for full-atom\ntime-coarsened dynamics. With the guidance of our well-designed intermediate\nforce field, FBM is feasible to target the Boltzmann-like distribution by\ndirect inference without extra steps. Experiments on small peptides verify our\nsuperiority in terms of comprehensive metrics and demonstrate transferability\nto unseen peptide systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13960v2","updated":"2024-08-27T15:06:17Z","published":"2024-08-25T23:48:11Z","title":"Time Series Analysis for Education: Methods, Applications, and Future\n Directions","summary":" Recent advancements in the collection and analysis of sequential educational\ndata have brought time series analysis to a pivotal position in educational\nresearch, highlighting its essential role in facilitating data-driven\ndecision-making. However, there is a lack of comprehensive summaries that\nconsolidate these advancements. To the best of our knowledge, this paper is the\nfirst to provide a comprehensive review of time series analysis techniques\nspecifically within the educational context. We begin by exploring the\nlandscape of educational data analytics, categorizing various data sources and\ntypes relevant to education. We then review four prominent time series\nmethods-forecasting, classification, clustering, and anomaly\ndetection-illustrating their specific application points in educational\nsettings. Subsequently, we present a range of educational scenarios and\napplications, focusing on how these methods are employed to address diverse\neducational tasks, which highlights the practical integration of multiple time\nseries methods to solve complex educational problems. Finally, we conclude with\na discussion on future directions, including personalized learning analytics,\nmultimodal data fusion, and the role of large language models (LLMs) in\neducational time series. The contributions of this paper include a detailed\ntaxonomy of educational data, a synthesis of time series techniques with\nspecific educational applications, and a forward-looking perspective on\nemerging trends and future research opportunities in educational analysis. The\nrelated papers and resources are available and regularly updated at the project\npage.\n","authors":["Shengzhong Mao","Chaoli Zhang","Yichi Song","Jindong Wang","Xiao-Jun Zeng","Zenglin Xu","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2408.13960v2.pdf","comment":"24 pages, 3 figures, 6 tables, project page: see\n https://github.com/ai-for-edu/time-series-analysis-for-education"},{"id":"http://arxiv.org/abs/2408.05892v3","updated":"2024-08-27T15:00:53Z","published":"2024-08-12T02:10:18Z","title":"Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer\n Detection","summary":" Polyp segmentation plays a crucial role in the early detection and diagnosis\nof colorectal cancer. However, obtaining accurate segmentations often requires\nlabor-intensive annotations and specialized models. 
Recently, Meta AI Research\nreleased a general Segment Anything Model 2 (SAM 2), which has demonstrated\npromising performance in several segmentation tasks. In this manuscript, we\nevaluate the performance of SAM 2 in segmenting polyps under various prompted\nsettings. We hope this report will provide insights to advance the field of\npolyp segmentation and promote more interesting work in the future. This\nproject is publicly available at https://github.com/sajjad-sh33/Polyp-SAM-2.\n","authors":["Mobina Mansoori","Sajjad Shahabodini","Jamshid Abouei","Konstantinos N. Plataniotis","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2408.05892v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12446v2","updated":"2024-08-27T14:59:25Z","published":"2024-08-22T14:41:49Z","title":"EX-DRL: Hedging Against Heavy Losses with EXtreme Distributional\n Reinforcement Learning","summary":" Recent advancements in Distributional Reinforcement Learning (DRL) for\nmodeling loss distributions have shown promise in developing hedging strategies\nin derivatives markets. A common approach in DRL involves learning the\nquantiles of loss distributions at specified levels using Quantile Regression\n(QR). This method is particularly effective in option hedging due to its direct\nquantile-based risk assessment, such as Value at Risk (VaR) and Conditional\nValue at Risk (CVaR). However, these risk measures depend on the accurate\nestimation of extreme quantiles in the loss distribution's tail, which can be\nimprecise in QR-based DRL due to the rarity and extremity of tail data, as\nhighlighted in the literature. To address this issue, we propose EXtreme DRL\n(EX-DRL), which enhances extreme quantile prediction by modeling the tail of\nthe loss distribution with a Generalized Pareto Distribution (GPD). This method\nintroduces supplementary data to mitigate the scarcity of extreme quantile\nobservations, thereby improving estimation accuracy through QR. Comprehensive\nexperiments on gamma hedging options demonstrate that EX-DRL improves existing\nQR-based models by providing more precise estimates of extreme quantiles,\nthereby improving the computation and reliability of risk metrics for complex\nfinancial risk management.\n","authors":["Parvin Malekzadeh","Zissis Poulos","Jacky Chen","Zeyu Wang","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2408.12446v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.15114v1","updated":"2024-08-27T14:54:33Z","published":"2024-08-27T14:54:33Z","title":"Few-Shot Unsupervised Implicit Neural Shape Representation Learning with\n Spatial Adversaries","summary":" Implicit Neural Representations have gained prominence as a powerful\nframework for capturing complex data modalities, encompassing a wide range from\n3D shapes to images and audio. Within the realm of 3D shape representation,\nNeural Signed Distance Functions (SDF) have demonstrated remarkable potential\nin faithfully encoding intricate shape geometry. However, learning SDFs from\nsparse 3D point clouds in the absence of ground truth supervision remains a\nvery challenging task. 
While recent methods rely on smoothness priors to\nregularize the learning, our method introduces a regularization term that\nleverages adversarial samples around the shape to improve the learned SDFs.\nThrough extensive experiments and evaluations, we illustrate the efficacy of\nour proposed method, highlighting its capacity to improve SDF learning with\nrespect to baselines and the state-of-the-art using synthetic and real data.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2408.15114v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2403.20324v3","updated":"2024-08-27T14:53:41Z","published":"2024-03-29T17:51:50Z","title":"Localising the Seizure Onset Zone from Single-Pulse Electrical\n Stimulation Responses with a CNN Transformer","summary":" Epilepsy is one of the most common neurological disorders, often requiring\nsurgical intervention when medication fails to control seizures. For effective\nsurgical outcomes, precise localisation of the epileptogenic focus - often\napproximated through the Seizure Onset Zone (SOZ) - is critical yet remains a\nchallenge. Active probing through electrical stimulation is already standard\nclinical practice for identifying epileptogenic areas. Our study advances the\napplication of deep learning for SOZ localisation using Single-Pulse Electrical\nStimulation (SPES) responses, with two key contributions. Firstly, we implement\nan existing deep learning model to compare two SPES analysis paradigms:\ndivergent and convergent. These paradigms evaluate outward and inward effective\nconnections, respectively. We assess the generalisability of these models to\nunseen patients and electrode placements using held-out test sets. Our findings\nreveal a notable improvement in moving from a divergent (AUROC: 0.574) to a\nconvergent approach (AUROC: 0.666), marking the first application of the latter\nin this context. Secondly, we demonstrate the efficacy of CNN Transformers with\ncross-channel attention in handling heterogeneous electrode placements,\nincreasing the AUROC to 0.730. These findings represent a significant step in\nmodelling patient-specific intracranial EEG electrode placements in SPES.\nFuture work will explore integrating these models into clinical decision-making\nprocesses to bridge the gap between deep learning research and practical\nhealthcare applications.\n","authors":["Jamie Norris","Aswin Chari","Dorien van Blooijs","Gerald Cooray","Karl Friston","Martin Tisdall","Richard Rosch"],"pdf_url":"https://arxiv.org/pdf/2403.20324v3.pdf","comment":"21 pages, 6 figures, accepted at Machine Learning for Healthcare 2024"},{"id":"http://arxiv.org/abs/2311.07537v2","updated":"2024-08-27T14:34:26Z","published":"2023-11-13T18:23:46Z","title":"Estimating optical vegetation indices and biophysical variables for\n temperate forests with Sentinel-1 SAR data using machine learning techniques:\n A case study for Czechia","summary":" Current optical vegetation indices (VIs) for monitoring forest ecosystems are\nwell established and widely used in various applications, but can be limited by\natmospheric effects such as clouds. In contrast, synthetic aperture radar (SAR)\ndata can offer insightful and systematic forest monitoring with complete time\nseries (TS) due to signal penetration through clouds and day and night image\nacquisitions. This study aims to address the limitations of optical satellite\ndata by using SAR data as an alternative for estimating optical VIs for forests\nthrough machine learning (ML). 
While this approach is less direct and likely\nonly feasible through the power of ML, it raises the scientific question of\nwhether enough relevant information is contained in the SAR signal to\naccurately estimate VIs. This work covers the estimation of TS of four VIs\n(LAI, FAPAR, EVI and NDVI) using multitemporal Sentinel-1 SAR and ancillary\ndata. The study focused on both healthy and disturbed temperate forest areas in\nCzechia for the year 2021, while ground truth labels were generated from Sentinel-2\nmultispectral data. This was enabled by creating a paired multi-modal TS\ndataset in Google Earth Engine (GEE), including temporally and spatially\naligned Sentinel-1, Sentinel-2, DEM, weather and land cover datasets. The\ninclusion of DEM-derived auxiliary features and additional meteorological\ninformation further improved the results. In the comparison of ML models, the\ntraditional ML algorithms, RFR and XGBoost, slightly outperformed the AutoML\napproach, auto-sklearn, for all VIs, achieving high accuracies ($R^2$ between\n70-86%) and low errors (MAE of 0.055-0.29). In general, up to 240 measurements\nper year and a spatial resolution of 20 m can be achieved using estimated\nSAR-based VIs with high accuracy. A great advantage of the SAR-based VI is the\nability to detect abrupt forest changes with sub-weekly temporal accuracy.\n","authors":["Daniel Paluba","Bertrand Le Saux","Přemysl Stych"],"pdf_url":"https://arxiv.org/pdf/2311.07537v2.pdf","comment":"Revised version of the preprint, based on comments from the\n reviewers. Full research article. 23 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2408.15099v1","updated":"2024-08-27T14:31:54Z","published":"2024-08-27T14:31:54Z","title":"No Regrets: Investigating and Improving Regret Approximations for\n Curriculum Discovery","summary":" What data or environments to use for training to improve downstream\nperformance is a longstanding and very topical question in reinforcement\nlearning. In particular, Unsupervised Environment Design (UED) methods have\ngained recent attention as their adaptive curricula enable agents to be robust\nto in- and out-of-distribution tasks. We ask to what extent these methods are\nthemselves robust when applied to a novel setting, closely inspired by a\nreal-world robotics problem. Surprisingly, we find that the state-of-the-art\nUED methods either do not improve upon the na\\\"{i}ve baseline of Domain\nRandomisation (DR), or require substantial hyperparameter tuning to do so. Our\nanalysis shows that this is due to their underlying scoring functions failing\nto predict intuitive measures of ``learnability'', i.e., in finding the\nsettings that the agent sometimes solves, but not always. Based on this, we\ninstead directly train on levels with high learnability and find that this\nsimple and intuitive approach outperforms UED methods and DR in several\nbinary-outcome environments, including on our domain and the standard UED\ndomain of Minigrid. We further introduce a new adversarial evaluation procedure\nfor directly measuring robustness, closely mirroring the conditional value at\nrisk (CVaR). 
We open-source all our code and present visualisations of final\npolicies here: https://github.com/amacrutherford/sampling-for-learnability.\n","authors":["Alexander Rutherford","Michael Beukman","Timon Willi","Bruno Lacerda","Nick Hawes","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2408.15099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15097v1","updated":"2024-08-27T14:30:06Z","published":"2024-08-27T14:30:06Z","title":"Data-Driven Nonlinear Deformation Design of 3D-Printable Shells","summary":" Designing and fabricating structures with specific mechanical properties\nrequires understanding the intricate relationship between design parameters and\nperformance. Understanding the design-performance relationship becomes\nincreasingly complicated for nonlinear deformations. Though successful at\nmodeling elastic deformations, simulation-based techniques struggle to model\nlarge elastoplastic deformations exhibiting plasticity and densification. We\npropose a neural network trained on experimental data to learn the\ndesign-performance relationship between 3D-printable shells and their\ncompressive force-displacement behavior. Trained on thousands of physical\nexperiments, our network aids in both forward and inverse design to generate\nshells exhibiting desired elastoplastic and hyperelastic deformations. We\nvalidate a subset of generated designs through fabrication and testing.\nFurthermore, we demonstrate the network's inverse design efficacy in generating\ncustom shells for several applications.\n","authors":["Samuel Silverman","Kelsey L. Snapp","Keith A. Brown","Emily Whiting"],"pdf_url":"https://arxiv.org/pdf/2408.15097v1.pdf","comment":"Submitted to 3D Printing and Additive Manufacturing"},{"id":"http://arxiv.org/abs/2408.15096v1","updated":"2024-08-27T14:26:56Z","published":"2024-08-27T14:26:56Z","title":"Post-processing fairness with minimal changes","summary":" In this paper, we introduce a novel post-processing algorithm that is both\nmodel-agnostic and does not require the sensitive attribute at test time. In\naddition, our algorithm is explicitly designed to enforce minimal changes\nbetween biased and debiased predictions; a property that, while highly\ndesirable, is rarely prioritized as an explicit objective in fairness\nliterature. Our approach leverages a multiplicative factor applied to the logit\nvalue of probability scores produced by a black-box classifier. We demonstrate\nthe efficacy of our method through empirical evaluations, comparing its\nperformance against other four debiasing algorithms on two widely used datasets\nin fairness research.\n","authors":["Federico Di Gennaro","Thibault Laugel","Vincent Grari","Xavier Renard","Marcin Detyniecki"],"pdf_url":"https://arxiv.org/pdf/2408.15096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15094v1","updated":"2024-08-27T14:25:42Z","published":"2024-08-27T14:25:42Z","title":"Constrained Diffusion Models via Dual Training","summary":" Diffusion models have attained prominence for their ability to synthesize a\nprobability distribution for a given dataset via a diffusion process, enabling\nthe generation of new data points with high fidelity. However, diffusion\nprocesses are prone to generating biased data based on the training dataset. To\naddress this issue, we develop constrained diffusion models by imposing\ndiffusion constraints based on desired distributions that are informed by\nrequirements. 
Specifically, we cast the training of diffusion models under\nrequirements as a constrained distribution optimization problem that aims to\nreduce the distribution difference between original and generated data while\nobeying constraints on the distribution of generated data. We show that our\nconstrained diffusion models generate new data from a mixture data distribution\nthat achieves the optimal trade-off among objective and constraints. To train\nconstrained diffusion models, we develop a dual training algorithm and\ncharacterize the optimality of the trained constrained diffusion model. We\nempirically demonstrate the effectiveness of our constrained models in two\nconstrained generation tasks: (i) we consider a dataset with one or more\nunderrepresented classes where we train the model with constraints to ensure\nfairly sampling from all classes during inference; (ii) we fine-tune a\npre-trained diffusion model to sample from a new dataset while avoiding\noverfitting.\n","authors":["Shervin Khalafi","Dongsheng Ding","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2408.15094v1.pdf","comment":"41 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.13843v2","updated":"2024-08-27T14:24:52Z","published":"2024-08-25T14:17:43Z","title":"Consistent machine learning for topology optimization with\n microstructure-dependent neural network material models","summary":" Additive manufacturing methods together with topology optimization have\nenabled the creation of multiscale structures with controlled spatially-varying\nmaterial microstructure. However, topology optimization or inverse design of\nsuch structures in the presence of nonlinearities remains a challenge due to\nthe expense of computational homogenization methods and the complexity of\ndifferentiably parameterizing the microstructural response. A solution to this\nchallenge lies in machine learning techniques that offer efficient,\ndifferentiable mappings between the material response and its microstructural\ndescriptors. This work presents a framework for designing multiscale\nheterogeneous structures with spatially varying microstructures by merging a\nhomogenization-based topology optimization strategy with a consistent machine\nlearning approach grounded in hyperelasticity theory. We leverage neural\narchitectures that adhere to critical physical principles such as\npolyconvexity, objectivity, material symmetry, and thermodynamic consistency to\nsupply the framework with a reliable constitutive model that is dependent on\nmaterial microstructural descriptors. Our findings highlight the potential of\nintegrating consistent machine learning models with density-based topology\noptimization for enhancing design optimization of heterogeneous hyperelastic\nstructures under finite deformations.\n","authors":["Harikrishnan Vijayakumaran","Jonathan B. Russ","Glaucio H. Paulino","Miguel A. Bessa"],"pdf_url":"https://arxiv.org/pdf/2408.13843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v4","updated":"2024-08-27T14:23:51Z","published":"2023-07-20T14:18:44Z","title":"Variational Autoencoding of Dental Point Clouds","summary":" Digital dentistry has made significant advancements, yet numerous challenges\nremain. This paper introduces the FDI 16 dataset, an extensive collection of\ntooth meshes and point clouds. Additionally, we present a novel approach:\nVariational FoldingNet (VF-Net), a fully probabilistic variational autoencoder\nfor point clouds. 
Notably, prior latent variable models for point clouds lack a\none-to-one correspondence between input and output points. Instead, they rely\non optimizing Chamfer distances, a metric that lacks a normalized\ndistributional counterpart, rendering it unsuitable for probabilistic modeling.\nWe replace the explicit minimization of Chamfer distances with a suitable\nencoder, increasing computational efficiency while simplifying the\nprobabilistic extension. This allows for straightforward application in various\ntasks, including mesh generation, shape completion, and representation\nlearning. Empirically, we provide evidence of lower reconstruction error in\ndental reconstruction and interpolation, showcasing state-of-the-art\nperformance in dental sample generation while identifying valuable latent\nrepresentations\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15089v1","updated":"2024-08-27T14:20:21Z","published":"2024-08-27T14:20:21Z","title":"SiHGNN: Leveraging Properties of Semantic Graphs for Efficient HGNN\n Acceleration","summary":" Heterogeneous Graph Neural Networks (HGNNs) have expanded graph\nrepresentation learning to heterogeneous graph fields. Recent studies have\ndemonstrated their superior performance across various applications, including\nmedical analysis and recommendation systems, often surpassing existing methods.\nHowever, GPUs often experience inefficiencies when executing HGNNs due to their\nunique and complex execution patterns. Compared to traditional Graph Neural\nNetworks, these patterns further exacerbate irregularities in memory access. To\ntackle these challenges, recent studies have focused on developing\ndomain-specific accelerators for HGNNs. Nonetheless, most of these efforts have\nconcentrated on optimizing the datapath or scheduling data accesses, while\nlargely overlooking the potential benefits that could be gained from leveraging\nthe inherent properties of the semantic graph, such as its topology, layout,\nand generation.\n In this work, we focus on leveraging the properties of semantic graphs to\nenhance HGNN performance. First, we analyze the Semantic Graph Build (SGB)\nstage and identify significant opportunities for data reuse during semantic\ngraph generation. Next, we uncover the phenomenon of buffer thrashing during\nthe Graph Feature Processing (GFP) stage, revealing potential optimization\nopportunities in semantic graph layout. Furthermore, we propose a lightweight\nhardware accelerator frontend for HGNNs, called SiHGNN. This accelerator\nfrontend incorporates a tree-based Semantic Graph Builder for efficient\nsemantic graph generation and features a novel Graph Restructurer for\noptimizing semantic graph layouts. Experimental results show that SiHGNN\nenables the state-of-the-art HGNN accelerator to achieve an average performance\nimprovement of 2.95$\\times$.\n","authors":["Runzhen Xue","Mingyu Yan","Dengke Han","Zhimin Tang","Xiaochun Ye","Dongrui Fan"],"pdf_url":"https://arxiv.org/pdf/2408.15089v1.pdf","comment":"12 pages, 18 figures. 
arXiv admin note: text overlap with\n arXiv:2404.04792"},{"id":"http://arxiv.org/abs/2408.14340v2","updated":"2024-08-27T14:09:44Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elio Quinton","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wenhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15077v1","updated":"2024-08-27T14:05:48Z","published":"2024-08-27T14:05:48Z","title":"MMASD+: A Novel Dataset for Privacy-Preserving Behavior Analysis of\n Children with Autism Spectrum Disorder","summary":" Autism spectrum disorder (ASD) is characterized by significant challenges in\nsocial interaction and comprehending communication signals. Recently,\ntherapeutic interventions for ASD have increasingly utilized Deep learning\npowered-computer vision techniques to monitor individual progress over time.\nThese models are trained on private, non-public datasets from the autism\ncommunity, creating challenges in comparing results across different models due\nto privacy-preserving data-sharing issues. This work introduces MMASD+. 
MMASD+\nconsists of diverse data modalities, including 3D-Skeleton, 3D Body Mesh, and\nOptical Flow data. It integrates the capabilities of Yolov8 and Deep SORT\nalgorithms to distinguish between the therapist and children, addressing a\nsignificant barrier in the original dataset. Additionally, a Multimodal\nTransformer framework is proposed to predict 11 action types and the presence\nof ASD. This framework achieves an accuracy of 95.03% for predicting action\ntypes and 96.42% for predicting ASD presence, demonstrating over a 10%\nimprovement compared to models trained on single data modalities. These\nfindings highlight the advantages of integrating multiple data modalities\nwithin the Multimodal Transformer framework.\n","authors":["Pavan Uttej Ravva","Behdokht Kiafar","Pinar Kullu","Jicheng Li","Anjana Bhat","Roghayeh Leila Barmaki"],"pdf_url":"https://arxiv.org/pdf/2408.15077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04580v2","updated":"2024-08-27T14:05:38Z","published":"2024-02-07T04:43:41Z","title":"A Comprehensive Survey of Cross-Domain Policy Transfer for Embodied\n Agents","summary":" The burgeoning fields of robot learning and embodied AI have triggered an\nincreasing demand for large quantities of data. However, collecting sufficient\nunbiased data from the target domain remains a challenge due to costly data\ncollection processes and stringent safety requirements. Consequently,\nresearchers often resort to data from easily accessible source domains, such as\nsimulation and laboratory environments, for cost-effective data acquisition and\nrapid model iteration. Nevertheless, the environments and embodiments of these\nsource domains can be quite different from their target domain counterparts,\nunderscoring the need for effective cross-domain policy transfer approaches. In\nthis paper, we conduct a systematic review of existing cross-domain policy\ntransfer methods. Through a nuanced categorization of domain gaps, we\nencapsulate the overarching insights and design considerations of each problem\nsetting. We also provide a high-level discussion about the key methodologies\nused in cross-domain policy transfer problems. Lastly, we summarize the open\nchallenges that lie beyond the capabilities of current paradigms and discuss\npotential future directions in this field.\n","authors":["Haoyi Niu","Jianming Hu","Guyue Zhou","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2402.04580v2.pdf","comment":"IJCAI 2024"},{"id":"http://arxiv.org/abs/2408.15076v1","updated":"2024-08-27T14:04:04Z","published":"2024-08-27T14:04:04Z","title":"MiWaves Reinforcement Learning Algorithm","summary":" The escalating prevalence of cannabis use poses a significant public health\nchallenge globally. In the U.S., cannabis use is more prevalent among emerging\nadults (EAs) (ages 18-25) than any other age group, with legalization in the\nmultiple states contributing to a public perception that cannabis is less risky\nthan in prior decades. To address this growing concern, we developed MiWaves, a\nreinforcement learning (RL) algorithm designed to optimize the delivery of\npersonalized intervention prompts to reduce cannabis use among EAs. MiWaves\nleverages domain expertise and prior data to tailor the likelihood of delivery\nof intervention messages. This paper presents a comprehensive overview of the\nalgorithm's design, including key decisions and experimental outcomes. 
The\nfinalized MiWaves RL algorithm was deployed in a clinical trial from March to\nMay 2024.\n","authors":["Susobhan Ghosh","Yongyi Guo","Pei-Yao Hung","Lara Coughlin","Erin Bonar","Inbal Nahum-Shani","Maureen Walton","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2408.15076v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2402.17739"},{"id":"http://arxiv.org/abs/2408.06425v5","updated":"2024-08-27T14:03:15Z","published":"2024-08-12T18:04:59Z","title":"Bayesian Learning in a Nonlinear Multiscale State-Space Model","summary":" The ubiquity of multiscale interactions in complex systems is\nwell-recognized, with development and heredity serving as a prime example of\nhow processes at different temporal scales influence one another. This work\nintroduces a novel multiscale state-space model to explore the dynamic\ninterplay between systems interacting across different time scales, with\nfeedback between each scale. We propose a Bayesian learning framework to\nestimate unknown states by learning the unknown process noise covariances\nwithin this multiscale model. We develop a Particle Gibbs with Ancestor\nSampling (PGAS) algorithm for inference and demonstrate through simulations the\nefficacy of our approach.\n","authors":["Nayely Vélez-Cruz","Manfred D. Laubichler"],"pdf_url":"https://arxiv.org/pdf/2408.06425v5.pdf","comment":"Corrected a typo"},{"id":"http://arxiv.org/abs/2408.15073v1","updated":"2024-08-27T14:02:21Z","published":"2024-08-27T14:02:21Z","title":"Interactive dense pixel visualizations for time series and model\n attribution explanations","summary":" The field of Explainable Artificial Intelligence (XAI) for Deep Neural\nNetwork models has developed significantly, offering numerous techniques to\nextract explanations from models. However, evaluating explanations is often not\ntrivial, and differences in applied metrics can be subtle, especially with\nnon-intelligible data. Thus, there is a need for visualizations tailored to\nexplore explanations for domains with such data, e.g., time series. We propose\nDAVOTS, an interactive visual analytics approach to explore raw time series\ndata, activations of neural networks, and attributions in a dense-pixel\nvisualization to gain insights into the data, models' decisions, and\nexplanations. To further support users in exploring large datasets, we apply\nclustering approaches to the visualized data domains to highlight groups and\npresent ordering strategies for individual and combined data exploration to\nfacilitate finding patterns. We visualize a CNN trained on the FordA dataset to\ndemonstrate the approach.\n","authors":["Udo Schlegel","Daniel A. Keim"],"pdf_url":"https://arxiv.org/pdf/2408.15073v1.pdf","comment":"5 pages, 2 figures, accepted at MLVIS 2023"},{"id":"http://arxiv.org/abs/2408.15065v1","updated":"2024-08-27T13:48:15Z","published":"2024-08-27T13:48:15Z","title":"The Benefits of Balance: From Information Projections to Variance\n Reduction","summary":" Data balancing across multiple modalities/sources appears in various forms in\nseveral foundation models (e.g., CLIP and DINO) achieving universal\nrepresentation learning. We show that this iterative algorithm, usually used to\navoid representation collapse, enjoys an unsuspected benefit: reducing the\nvariance of estimators that are functionals of the empirical distribution over\nthese sources. 
We provide non-asymptotic bounds quantifying this variance\nreduction effect and relate them to the eigendecays of appropriately defined\nMarkov operators. We explain how various forms of data balancing in contrastive\nmultimodal learning and self-supervised clustering can be interpreted as\ninstances of this variance reduction scheme.\n","authors":["Lang Liu","Ronak Mehta","Soumik Pal","Zaid Harchaoui"],"pdf_url":"https://arxiv.org/pdf/2408.15065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15057v1","updated":"2024-08-27T13:40:15Z","published":"2024-08-27T13:40:15Z","title":"Subgroup Analysis via Model-based Rule Forest","summary":" Machine learning models are often criticized for their black-box nature,\nraising concerns about their applicability in critical decision-making\nscenarios. Consequently, there is a growing demand for interpretable models in\nsuch contexts. In this study, we introduce Model-based Deep Rule Forests\n(mobDRF), an interpretable representation learning algorithm designed to\nextract transparent models from data. By leveraging IF-THEN rules with\nmulti-level logic expressions, mobDRF enhances the interpretability of existing\nmodels without compromising accuracy. We apply mobDRF to identify key risk\nfactors for cognitive decline in an elderly population, demonstrating its\neffectiveness in subgroup analysis and local model optimization. Our method\noffers a promising solution for developing trustworthy and interpretable\nmachine learning models, particularly valuable in fields like healthcare, where\nunderstanding differential effects across patient subgroups can lead to more\npersonalized and effective treatments.\n","authors":["I-Ling Cheng","Chan Hsu","Chantung Ku","Pei-Ju Lee","Yihuang Kang"],"pdf_url":"https://arxiv.org/pdf/2408.15057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09212v4","updated":"2024-08-27T13:40:13Z","published":"2024-07-12T12:20:39Z","title":"Generating $SROI^-$ Ontologies via Knowledge Graph Query Embedding\n Learning","summary":" Query embedding approaches answer complex logical queries over incomplete\nknowledge graphs (KGs) by computing and operating on low-dimensional vector\nrepresentations of entities, relations, and queries. However, current query\nembedding models heavily rely on excessively parameterized neural networks and\ncannot explain the knowledge learned from the graph. We propose a novel query\nembedding method, AConE, which explains the knowledge learned from the graph in\nthe form of $SROI^-$ description logic axioms while being more\nparameter-efficient than most existing approaches. AConE associates queries to\na $SROI^-$ description logic concept. Every $SROI^-$ concept is embedded as a\ncone in complex vector space, and each $SROI^-$ relation is embedded as a\ntransformation that rotates and scales cones. We show theoretically that AConE\ncan learn $SROI^-$ axioms, and defines an algebra whose operations correspond\none to one to $SROI^-$ description logic concept constructs. Our empirical\nstudy on multiple query datasets shows that AConE achieves superior results\nover previous baselines with fewer parameters. Notably on the WN18RR dataset,\nAConE achieves significant improvement over baseline models. 
We provide\ncomprehensive analyses showing that the capability to represent axioms\npositively impacts the results of query answering.\n","authors":["Yunjie He","Daniel Hernandez","Mojtaba Nayyeri","Bo Xiong","Yuqicheng Zhu","Evgeny Kharlamov","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2407.09212v4.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2408.15055v1","updated":"2024-08-27T13:32:31Z","published":"2024-08-27T13:32:31Z","title":"Causal Rule Forest: Toward Interpretable and Precise Treatment Effect\n Estimation","summary":" Understanding and inferencing Heterogeneous Treatment Effects (HTE) and\nConditional Average Treatment Effects (CATE) are vital for developing\npersonalized treatment recommendations. Many state-of-the-art approaches\nachieve inspiring performance in estimating HTE on benchmark datasets or\nsimulation studies. However, the indirect predicting manner and complex model\narchitecture reduce the interpretability of these approaches. To mitigate the\ngap between predictive performance and heterogeneity interpretability, we\nintroduce the Causal Rule Forest (CRF), a novel approach to learning hidden\npatterns from data and transforming the patterns into interpretable multi-level\nBoolean rules. By training other interpretable causal inference models with\ndata representation learned by CRF, we can reduce the predictive errors of\nthese models in estimating HTE and CATE, while keeping their interpretability\nfor identifying subgroups for which a treatment is more effective. Our experiments\nunderscore the potential of CRF to advance personalized interventions and\npolicies, paving the way for future research to enhance its scalability and\napplication across complex causal inference challenges.\n","authors":["Chan Hsu","Jun-Ting Wu","Yihuang Kang"],"pdf_url":"https://arxiv.org/pdf/2408.15055v1.pdf","comment":"The 25th IEEE International Conference on Information Reuse and\n Integration for Data Science (IRI 2024)"},{"id":"http://arxiv.org/abs/2301.01188v4","updated":"2024-08-27T13:28:52Z","published":"2022-12-29T01:07:19Z","title":"Deep R Programming","summary":" Deep R Programming is a comprehensive and in-depth introductory course on one\nof the most popular languages for data science. It equips ambitious students,\nprofessionals, and researchers with the knowledge and skills to become\nindependent users of this potent environment so that they can tackle any\nproblem related to data wrangling and analytics, numerical computing,\nstatistics, and machine learning. This textbook is a non-profit project. Its\nonline and PDF versions are freely available at\n.\n","authors":["Marek Gagolewski"],"pdf_url":"https://arxiv.org/pdf/2301.01188v4.pdf","comment":"v1.0.1 (2024-08-27)"},{"id":"http://arxiv.org/abs/2403.00381v2","updated":"2024-08-27T13:13:54Z","published":"2024-03-01T09:09:37Z","title":"Structured Deep Neural Networks-Based Backstepping Trajectory Tracking\n Control for Lagrangian Systems","summary":" Deep neural networks (DNN) are increasingly being used to learn controllers\ndue to their excellent approximation capabilities. However, their black-box\nnature poses significant challenges to closed-loop stability guarantees and\nperformance analysis. In this paper, we introduce a structured DNN-based\ncontroller for the trajectory tracking control of Lagrangian systems using\nbackstepping techniques. 
By properly designing neural network structures, the\nproposed controller can ensure closed-loop stability for any compatible neural\nnetwork parameters. In addition, improved control performance can be achieved\nby further optimizing neural network parameters. Besides, we provide explicit\nupper bounds on tracking errors in terms of controller parameters, which allows\nus to achieve the desired tracking performance by properly selecting the\ncontroller parameters. Furthermore, when system models are unknown, we propose\nan improved Lagrangian neural network (LNN) structure to learn the system\ndynamics and design the controller. We show that in the presence of model\napproximation errors and external disturbances, the closed-loop stability and\ntracking control performance can still be guaranteed. The effectiveness of the\nproposed approach is demonstrated through simulations.\n","authors":["Jiajun Qian","Liang Xu","Xiaoqiang Ren","Xiaofan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.00381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15041v1","updated":"2024-08-27T13:10:26Z","published":"2024-08-27T13:10:26Z","title":"Earth Observation Satellite Scheduling with Graph Neural Networks","summary":" The Earth Observation Satellite Planning (EOSP) is a difficult optimization\nproblem with considerable practical interest. A set of requested observations\nmust be scheduled on an agile Earth observation satellite while respecting\nconstraints on their visibility window, as well as maneuver constraints that\nimpose varying delays between successive observations. In addition, the problem\nis largely oversubscribed: there are much more candidate observations than what\ncan possibly be achieved. Therefore, one must select the set of observations\nthat will be performed while maximizing their weighted cumulative benefit, and\npropose a feasible schedule for these observations. As previous work mostly\nfocused on heuristic and iterative search algorithms, this paper presents a new\ntechnique for selecting and scheduling observations based on Graph Neural\nNetworks (GNNs) and Deep Reinforcement Learning (DRL). GNNs are used to extract\nrelevant information from the graphs representing instances of the EOSP, and\nDRL drives the search for optimal schedules. Our simulations show that it is\nable to learn on small problem instances and generalize to larger real-world\ninstances, with very competitive performance compared to traditional\napproaches.\n","authors":["Antoine Jacquet","Guillaume Infantes","Nicolas Meuleau","Emmanuel Benazera","Stéphanie Roussel","Vincent Baudoui","Jonathan Guerra"],"pdf_url":"https://arxiv.org/pdf/2408.15041v1.pdf","comment":"Accepted at 17th European Workshop on Reinforcement Learning (EWRL\n 2024)"},{"id":"http://arxiv.org/abs/2405.17035v3","updated":"2024-08-27T13:05:33Z","published":"2024-05-27T10:42:13Z","title":"Glauber Generative Model: Discrete Diffusion Models via Binary\n Classification","summary":" We introduce the Glauber Generative Model (GGM), a new class of discrete\ndiffusion models, to obtain new samples from a distribution given samples from\na discrete space. GGM deploys a discrete Markov chain called the heat bath\ndynamics (or the Glauber dynamics) to denoise a sequence of noisy tokens to a\nsample from a joint distribution of discrete tokens. Our novel conceptual\nframework provides an exact reduction of the task of learning the denoising\nMarkov chain to solving a class of binary classification tasks. 
More\nspecifically, the model learns to classify a given token in a noisy sequence as\nsignal or noise. In contrast, prior works on discrete diffusion models either\nsolve regression problems to learn importance ratios, or minimize loss\nfunctions given by variational approximations. We apply GGM to language\nmodeling and image generation, where images are discretized using image\ntokenizers like VQGANs. We show that it outperforms existing discrete diffusion\nmodels in language generation, and demonstrates strong performance for image\ngeneration without using dataset-specific image tokenizers. We also show that\nour model is capable of performing well in zero-shot control settings like text\nand image infilling.\n","authors":["Harshit Varma","Dheeraj Nagaraj","Karthikeyan Shanmugam"],"pdf_url":"https://arxiv.org/pdf/2405.17035v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09766v2","updated":"2024-08-27T13:01:56Z","published":"2024-02-15T07:35:52Z","title":"From Variability to Stability: Advancing RecSys Benchmarking Practices","summary":" In the rapidly evolving domain of Recommender Systems (RecSys), new\nalgorithms frequently claim state-of-the-art performance based on evaluations\nover a limited set of arbitrarily selected datasets. However, this approach may\nfail to holistically reflect their effectiveness due to the significant impact\nof dataset characteristics on algorithm performance. Addressing this\ndeficiency, this paper introduces a novel benchmarking methodology to\nfacilitate a fair and robust comparison of RecSys algorithms, thereby advancing\nevaluation practices. By utilizing a diverse set of $30$ open datasets,\nincluding two introduced in this work, and evaluating $11$ collaborative\nfiltering algorithms across $9$ metrics, we critically examine the influence of\ndataset characteristics on algorithm performance. We further investigate the\nfeasibility of aggregating outcomes from multiple datasets into a unified\nranking. Through rigorous experimental analysis, we validate the reliability of\nour methodology under the variability of datasets, offering a benchmarking\nstrategy that balances quality and computational demands. This methodology\nenables a fair yet effective means of evaluating RecSys algorithms, providing\nvaluable guidance for future research endeavors.\n","authors":["Valeriy Shevchenko","Nikita Belousov","Alexey Vasilev","Vladimir Zholobov","Artyom Sosedka","Natalia Semenova","Anna Volodkevich","Andrey Savchenko","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2402.09766v2.pdf","comment":"8 pages with 11 figures"},{"id":"http://arxiv.org/abs/2408.13628v2","updated":"2024-08-27T12:53:22Z","published":"2024-08-24T17:10:59Z","title":"Enhancing Uplift Modeling in Multi-Treatment Marketing Campaigns:\n Leveraging Score Ranking and Calibration Techniques","summary":" Uplift modeling is essential for optimizing marketing strategies by selecting\nindividuals likely to respond positively to specific marketing campaigns. This\nimportance escalates in multi-treatment marketing campaigns, where diverse\ntreatment is available and we may want to assign the customers to treatment\nthat can make the most impact. While there are existing approaches with\nconvenient frameworks like Causalml, there are potential spaces to enhance the\neffect of uplift modeling in multi treatment cases. 
This paper introduces a\nnovel approach to uplift modeling in multi-treatment campaigns, leveraging\nscore ranking and calibration techniques to improve overall performance of the\nmarketing campaign. We review existing uplift models, including Meta Learner\nframeworks (S, T, X), and their application in real-world scenarios.\nAdditionally, we delve into insights from multi-treatment studies to highlight\nthe complexities and potential advancements in the field. Our methodology\nincorporates Meta-Learner calibration and a scoring rank-based offer selection\nstrategy. Extensive experiment results with real-world datasets demonstrate the\npractical benefits and superior performance of our approach. The findings\nunderscore the critical role of integrating score ranking and calibration\ntechniques in refining the performance and reliability of uplift predictions,\nthereby advancing predictive modeling in marketing analytics and providing\nactionable insights for practitioners seeking to optimize their campaign\nstrategies.\n","authors":["Yoon Tae Park","Ting Xu","Mohamed Anany"],"pdf_url":"https://arxiv.org/pdf/2408.13628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10635v5","updated":"2024-08-27T12:12:42Z","published":"2024-03-26T15:36:47Z","title":"Compressed Federated Reinforcement Learning with a Generative Model","summary":" Reinforcement learning has recently gained unprecedented popularity, yet it\nstill grapples with sample inefficiency. Addressing this challenge, federated\nreinforcement learning (FedRL) has emerged, wherein agents collaboratively\nlearn a single policy by aggregating local estimations. However, this\naggregation step incurs significant communication costs. In this paper, we\npropose CompFedRL, a communication-efficient FedRL approach incorporating both\n\\textit{periodic aggregation} and (direct/error-feedback) compression\nmechanisms. Specifically, we consider compressed federated $Q$-learning with a\ngenerative model setup, where a central server learns an optimal $Q$-function\nby periodically aggregating compressed $Q$-estimates from local agents. For the\nfirst time, we characterize the impact of these two mechanisms (which have\nremained elusive) by providing a finite-time analysis of our algorithm,\ndemonstrating strong convergence behaviors when utilizing either direct or\nerror-feedback compression. Our bounds indicate improved solution accuracy\nconcerning the number of agents and other federated hyperparameters while\nsimultaneously reducing communication costs. To corroborate our theory, we also\nconduct in-depth numerical experiments to verify our findings, considering\nTop-$K$ and Sparsified-$K$ sparsification operators.\n","authors":["Ali Beikmohammadi","Sarit Khirirat","Sindri Magnússon"],"pdf_url":"https://arxiv.org/pdf/2404.10635v5.pdf","comment":"European Conference on Machine Learning and Principles and Practice\n of Knowledge Discovery in Databases (ECML-PKDD 2024)"},{"id":"http://arxiv.org/abs/2111.10847v3","updated":"2024-08-27T12:09:32Z","published":"2021-11-21T15:58:01Z","title":"Diffusion Tensor Estimation with Uncertainty Calibration","summary":" It is highly desirable to know how uncertain a model's predictions are,\nespecially for models that are complex and hard to understand as in deep\nlearning. Although there has been a growing interest in using deep learning\nmethods in diffusion-weighted MRI, prior works have not addressed the issue of\nmodel uncertainty. 
Here, we propose a deep learning method to estimate the\ndiffusion tensor and compute the estimation uncertainty. Data-dependent\nuncertainty is computed directly by the network and learned via loss\nattenuation. Model uncertainty is computed using Monte Carlo dropout. We also\npropose a new method for evaluating the quality of predicted uncertainties. We\ncompare the new method with the standard least-squares tensor estimation and\nbootstrap-based uncertainty computation techniques. Our experiments show that\nwhen the number of measurements is small the deep learning method is more\naccurate and its uncertainty predictions are better calibrated than the\nstandard methods. We show that the estimation uncertainties computed by the new\nmethod can highlight the model's biases, detect domain shift, and reflect the\nstrength of noise in the measurements. Our study shows the importance and\npractical value of modeling prediction uncertainties in deep learning-based\ndiffusion MRI analysis.\n","authors":["Davood Karimi","Simon K. Warfield","Ali Gholipour"],"pdf_url":"https://arxiv.org/pdf/2111.10847v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09096v2","updated":"2024-08-27T11:50:50Z","published":"2024-07-12T08:48:16Z","title":"STD-PLM: Understanding Both Spatial and Temporal Properties of\n Spatial-Temporal Data with PLM","summary":" Spatial-temporal forecasting and imputation are important for real-world\nintelligent systems. Most existing methods are tailored for individual\nforecasting or imputation tasks but are not designed for both. Additionally,\nthey are less effective for zero-shot and few-shot learning. While pre-trained\nlanguage model (PLM) have exhibited strong pattern recognition and reasoning\nabilities across various tasks, including few-shot and zero-shot learning,\ntheir applications in spatial-temporal data understanding has been constrained\nby insufficient modeling of complex correlations such as the temporal\ncorrelations, spatial connectivity, non-pairwise and high-order\nspatial-temporal correlations within data. In this paper, we propose STD-PLM\nfor understanding both spatial and temporal properties of\n\\underline{S}patial-\\underline{T}emporal \\underline{D}ata with \\underline{PLM},\nwhich is capable of implementing both spatial-temporal forecasting and\nimputation tasks. STD-PLM understands spatial-temporal correlations via\nexplicitly designed spatial and temporal tokenizers. Topology-aware node\nembeddings are designed for PLM to comprehend and exploit the topology\nstructure of data in inductive manner. Furthermore, to mitigate the efficiency\nissues introduced by the PLM, we design a sandglass attention module (SGA)\ncombined with a specific constrained loss function, which significantly\nimproves the model's efficiency while ensuring performance. Extensive\nexperiments demonstrate that STD-PLM exhibits competitive performance and\ngeneralization capabilities across the forecasting and imputation tasks on\nvarious datasets. 
Moreover, STD-PLM achieves promising results on both few-shot\nand zero-shot tasks.\n","authors":["YiHeng Huang","Xiaowei Mao","Shengnan Guo","Yubin Chen","Junfeng Shen","Tiankuo Li","Youfang Lin","Huaiyu Wan"],"pdf_url":"https://arxiv.org/pdf/2407.09096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16263v4","updated":"2024-08-27T11:48:49Z","published":"2022-03-30T12:48:22Z","title":"Does Audio Deepfake Detection Generalize?","summary":" Current text-to-speech algorithms produce realistic fakes of human voices,\nmaking deepfake detection a much-needed area of research. While researchers\nhave presented various techniques for detecting audio spoofs, it is often\nunclear exactly why these architectures are successful: Preprocessing steps,\nhyperparameter settings, and the degree of fine-tuning are not consistent\nacross related work. Which factors contribute to success, and which are\naccidental? In this work, we address this problem: We systematize audio\nspoofing detection by re-implementing and uniformly evaluating architectures\nfrom related work. We identify overarching features for successful audio\ndeepfake detection, such as using cqtspec or logspec features instead of\nmelspec features, which improves performance by 37% EER on average, all other\nfactors constant. Additionally, we evaluate generalization capabilities: We\ncollect and publish a new dataset consisting of 37.9 hours of found audio\nrecordings of celebrities and politicians, of which 17.2 hours are deepfakes.\nWe find that related work performs poorly on such real-world data (performance\ndegradation of up to one thousand percent). This may suggest that the community\nhas tailored its solutions too closely to the prevailing ASVSpoof benchmark and\nthat deepfakes are much harder to detect outside the lab than previously\nthought.\n","authors":["Nicolas M. Müller","Pavel Czempin","Franziska Dieckmann","Adam Froghyar","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2203.16263v4.pdf","comment":"Interspeech 2022"},{"id":"http://arxiv.org/abs/2408.14976v1","updated":"2024-08-27T11:38:01Z","published":"2024-08-27T11:38:01Z","title":"Prior-free Balanced Replay: Uncertainty-guided Reservoir Sampling for\n Long-Tailed Continual Learning","summary":" Even in the era of large models, one of the well-known issues in continual\nlearning (CL) is catastrophic forgetting, which is significantly challenging\nwhen the continual data stream exhibits a long-tailed distribution, termed as\nLong-Tailed Continual Learning (LTCL). Existing LTCL solutions generally\nrequire the label distribution of the data stream to achieve re-balance\ntraining. However, obtaining such prior information is often infeasible in real\nscenarios since the model should learn without pre-identifying the majority and\nminority classes. To this end, we propose a novel Prior-free Balanced Replay\n(PBR) framework to learn from long-tailed data stream with less forgetting.\nConcretely, motivated by our experimental finding that the minority classes are\nmore likely to be forgotten due to the higher uncertainty, we newly design an\nuncertainty-guided reservoir sampling strategy to prioritize rehearsing\nminority data without using any prior information, which is based on the mutual\ndependence between the model and samples. Additionally, we incorporate two\nprior-free components to further reduce the forgetting issue: (1) Boundary\nconstraint is to preserve uncertain boundary supporting samples for continually\nre-estimating task boundaries. 
(2) Prototype constraint is to maintain the\nconsistency of learned class prototypes along with training. Our approach is\nevaluated on three standard long-tailed benchmarks, demonstrating superior\nperformance to existing CL methods and previous SOTA LTCL approach in both\ntask- and class-incremental learning settings, as well as ordered- and\nshuffled-LTCL settings.\n","authors":["Lei Liu","Li Liu","Yawen Cui"],"pdf_url":"https://arxiv.org/pdf/2408.14976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10672v2","updated":"2024-08-27T11:13:43Z","published":"2024-03-15T20:48:41Z","title":"Riemannian Flow Matching Policy for Robot Motion Learning","summary":" We introduce Riemannian Flow Matching Policies (RFMP), a novel model for\nlearning and synthesizing robot visuomotor policies. RFMP leverages the\nefficient training and inference capabilities of flow matching methods. By\ndesign, RFMP inherits the strengths of flow matching: the ability to encode\nhigh-dimensional multimodal distributions, commonly encountered in robotic\ntasks, and a very simple and fast inference process. We demonstrate the\napplicability of RFMP to both state-based and vision-conditioned robot motion\npolicies. Notably, as the robot state resides on a Riemannian manifold, RFMP\ninherently incorporates geometric awareness, which is crucial for realistic\nrobotic tasks. To evaluate RFMP, we conduct two proof-of-concept experiments,\ncomparing its performance against Diffusion Policies. Although both approaches\nsuccessfully learn the considered tasks, our results show that RFMP provides\nsmoother action trajectories with significantly lower inference times.\n","authors":["Max Braun","Noémie Jaquier","Leonel Rozo","Tamim Asfour"],"pdf_url":"https://arxiv.org/pdf/2403.10672v2.pdf","comment":"Accepted for publication at IROS'24. 8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.14964v1","updated":"2024-08-27T11:10:39Z","published":"2024-08-27T11:10:39Z","title":"Cross-Modal Learning for Chemistry Property Prediction: Large Language\n Models Meet Graph Machine Learning","summary":" In the field of chemistry, the objective is to create novel molecules with\ndesired properties, facilitating accurate property predictions for applications\nsuch as material design and drug screening. However, existing graph deep\nlearning methods face limitations that curb their expressive power. To address\nthis, we explore the integration of vast molecular domain knowledge from Large\nLanguage Models (LLMs) with the complementary strengths of Graph Neural\nNetworks (GNNs) to enhance performance in property prediction tasks. We\nintroduce a Multi-Modal Fusion (MMF) framework that synergistically harnesses\nthe analytical prowess of GNNs and the linguistic generative and predictive\nabilities of LLMs, thereby improving accuracy and robustness in predicting\nmolecular properties. Our framework combines the effectiveness of GNNs in\nmodeling graph-structured data with the zero-shot and few-shot learning\ncapabilities of LLMs, enabling improved predictions while reducing the risk of\noverfitting. 
Furthermore, our approach effectively addresses distributional\nshifts, a common challenge in real-world applications, and showcases the\nefficacy of learning cross-modal representations, surpassing state-of-the-art\nbaselines on benchmark datasets for property prediction tasks.\n","authors":["Sakhinana Sagar Srinivas","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.14964v1.pdf","comment":"Paper Accepted at Workshop on Robustness of Few-shot and Zero-shot\n Learning in Foundation Models at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2406.17640v2","updated":"2024-08-27T11:00:47Z","published":"2024-06-25T15:24:06Z","title":"BayTTA: Uncertainty-aware medical image classification with optimized\n test-time augmentation using Bayesian model averaging","summary":" Test-time augmentation (TTA) is a well-known technique employed during the\ntesting phase of computer vision tasks. It involves aggregating multiple\naugmented versions of input data. Combining predictions using a simple average\nformulation is a common and straightforward approach after performing TTA. This\npaper introduces a novel framework for optimizing TTA, called BayTTA\n(Bayesian-based TTA), which is based on Bayesian Model Averaging (BMA). First,\nwe generate a prediction list associated with different variations of the input\ndata created through TTA. Then, we use BMA to combine predictions weighted by\nthe respective posterior probabilities. Such an approach allows one to take\ninto account model uncertainty, and thus to enhance the predictive performance\nof the related machine learning or deep learning model. We evaluate the\nperformance of BayTTA on various public data, including three medical image\ndatasets comprising skin cancer, breast cancer, and chest X-ray images and two\nwell-known gene editing datasets, CRISPOR and GUIDE-seq. Our experimental\nresults indicate that BayTTA can be effectively integrated into\nstate-of-the-art deep learning models used in medical image analysis as well as\ninto some popular pre-trained CNN models such as VGG-16, MobileNetV2,\nDenseNet201, ResNet152V2, and InceptionRes-NetV2, leading to the enhancement in\ntheir accuracy and robustness performance. The source code of the proposed\nBayTTA method is freely available at: \\underline\n{https://github.com/Z-Sherkat/BayTTA}.\n","authors":["Zeinab Sherkatghanad","Moloud Abdar","Mohammadreza Bakhtyari","Pawel Plawiak","Vladimir Makarenkov"],"pdf_url":"https://arxiv.org/pdf/2406.17640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03729v2","updated":"2024-08-27T10:57:01Z","published":"2024-06-06T04:05:12Z","title":"Enhancing Sign Language Detection through Mediapipe and Convolutional\n Neural Networks (CNN)","summary":" This research combines MediaPipe and CNNs for the efficient and accurate\ninterpretation of ASL dataset for the real-time detection of sign language. The\nsystem presented here captures and processes hands' gestures in real time. the\nintended purpose was to create a very easy, accurate, and fast way of entering\ncommands without the necessity of touching something.MediaPipe supports one of\nthe powerful frameworks in real-time hand tracking capabilities for the ability\nto capture and preprocess hand movements, which increases the accuracy of the\ngesture recognition system. 
The integration of CNN with MediaPipe\nalso results in more efficient real-time processing. The model was tested on\nAmerican Sign Language (ASL) datasets and achieved an accuracy of 99.12\\%. The\nresults were then compared to those of existing methods, using established\nevaluation techniques, to evaluate how well the model performed. The system\nwill have applications in the communication, education, and accessibility\ndomains. Improving systems such as the one described in this paper will assist\npeople with hearing impairment and make technology more accessible to them. We\ntested the recognition and translation performance on an ASL dataset and\nachieved better accuracy than previous models. The aim of the research is to\nidentify American Sign Language characters from hand images captured by a web\ncamera, based on MediaPipe and CNNs.\n","authors":["Aditya Raj Verma","Gagandeep Singh","Karnim Meghwal","Banawath Ramji","Praveen Kumar Dadheech"],"pdf_url":"https://arxiv.org/pdf/2406.03729v2.pdf","comment":"We have decided to withdraw our paper due to significant revisions\n and improvements that need to be made based on new findings. After further\n analysis, we believe these changes are necessary to ensure the accuracy and\n completeness of our work. We plan to resubmit the revised version in the\n future once the updates are complete"},{"id":"http://arxiv.org/abs/2408.14951v1","updated":"2024-08-27T10:54:51Z","published":"2024-08-27T10:54:51Z","title":"Domain-decoupled Physics-informed Neural Networks with Closed-form\n Gradients for Fast Model Learning of Dynamical Systems","summary":" Physics-informed neural networks (PINNs) are trained using physical equations\nand can also incorporate unmodeled effects by learning from data. PINNs for\ncontrol (PINCs) of dynamical systems are gaining interest due to their\nprediction speed compared to classical numerical integration methods for\nnonlinear state-space models, making them suitable for real-time control\napplications. We introduce the domain-decoupled physics-informed neural network\n(DD-PINN) to address current limitations of PINC in handling large and complex\nnonlinear dynamic systems. The time domain is decoupled from the feed-forward\nneural network to construct an Ansatz function, allowing for calculation of\ngradients in closed form. This approach significantly reduces training times,\nespecially for large dynamical systems, compared to PINC, which relies on\ngraph-based automatic differentiation. Additionally, the DD-PINN inherently\nfulfills the initial condition and supports higher-order excitation inputs,\nsimplifying the training process and enabling improved prediction accuracy.\nValidation on three systems - a nonlinear mass-spring-damper, a\nfive-mass-chain, and a two-link robot - demonstrates that the DD-PINN achieves\nsignificantly shorter training times. In cases where the PINC's prediction\ndiverges, the DD-PINN's prediction remains stable and accurate due to higher\nphysics loss reduction or use of a higher-order excitation input. 
The DD-PINN\nallows for fast and accurate learning of large dynamical systems previously out\nof reach for the PINC.\n","authors":["Henrik Krauss","Tim-Lukas Habich","Max Bartholdt","Thomas Seel","Moritz Schappler"],"pdf_url":"https://arxiv.org/pdf/2408.14951v1.pdf","comment":"Accepted to International Conference on Informatics in Control,\n Automation and Robotics (ICINCO) 2024"},{"id":"http://arxiv.org/abs/2408.14935v1","updated":"2024-08-27T10:17:22Z","published":"2024-08-27T10:17:22Z","title":"Quotient Normalized Maximum Likelihood Criterion for Learning Bayesian\n Network Structures","summary":" We introduce an information theoretic criterion for Bayesian network\nstructure learning which we call quotient normalized maximum likelihood (qNML).\nIn contrast to the closely related factorized normalized maximum likelihood\ncriterion, qNML satisfies the property of score equivalence. It is also\ndecomposable and completely free of adjustable hyperparameters. For practical\ncomputations, we identify a remarkably accurate approximation proposed earlier\nby Szpankowski and Weinberger. Experiments on both simulated and real data\ndemonstrate that the new criterion leads to parsimonious models with good\npredictive accuracy.\n","authors":["Tomi Silander","Janne Leppä-aho","Elias Jääsaari","Teemu Roos"],"pdf_url":"https://arxiv.org/pdf/2408.14935v1.pdf","comment":"Accepted to AISTATS 2018"},{"id":"http://arxiv.org/abs/2406.15504v2","updated":"2024-08-27T10:07:27Z","published":"2024-06-19T16:43:56Z","title":"Dr.E Bridges Graphs with Large Language Models through Words","summary":" Significant efforts have been dedicated to integrating the powerful Large\nLanguage Models (LLMs) with diverse modalities, particularly focusing on the\nfusion of language, vision and audio data. However, the graph-structured data,\nwhich is inherently rich in structural and domain-specific knowledge, has not\nyet been gracefully adapted to LLMs. Existing methods either describe the graph\nwith raw text, suffering the loss of graph structural information, or feed\nGraph Neural Network (GNN) embeddings into LLMs at the cost of losing\nexplainable prompt semantics. To bridge this gap, we introduce an end-to-end\nmodality-aligning framework for LLM-graph alignment: Dual-Residual Vector\nQuantized-Variational AutoEncoder, namely Dr.E. Our approach is purposefully\ndesigned to facilitate token-level alignment with LLMs, enabling an effective\ntranslation of the intrinsic `language' of graphs into comprehensible natural\nlanguage. We also manage to enhance LLMs' more robust structural understanding\nof graphs by incorporating multiple views of the central nodes based on their\nsurrounding nodes at various distances. Our experimental evaluations on\nstandard graph tasks demonstrate competitive performance against other\nstate-of-the-art (SOTA) approaches. Additionally, our framework ensures certain\nvisual interpretability, efficiency, and robustness, marking the promising\nsuccessful endeavor to achieve token-level alignment between LLMs and GNNs. 
Our\ncode is available at: https://anonymous.4open.science/r/dre-817.\n","authors":["Zipeng Liu","Likang Wu","Ming He","Zhong Guan","Hongke Zhao","Nan Feng"],"pdf_url":"https://arxiv.org/pdf/2406.15504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14928v1","updated":"2024-08-27T10:05:37Z","published":"2024-08-27T10:05:37Z","title":"Targetin the partition function of chemically disordered materials with\n a generative approach based on inverse variational autoencoders","summary":" Computing atomic-scale properties of chemically disordered materials requires\nan efficient exploration of their vast configuration space. Traditional\napproaches such as Monte Carlo or Special Quasirandom Structures either entail\nsampling an excessive amount of configurations or do not ensure that the\nconfiguration space has been properly covered. In this work, we propose a novel\napproach where generative machine learning is used to yield a representative\nset of configurations for accurate property evaluation and provide accurate\nestimations of atomic-scale properties with minimal computational cost. Our\nmethod employs a specific type of variational autoencoder with inverse roles\nfor the encoder and decoder, enabling the application of an unsupervised active\nlearning scheme that does not require any initial training database. The model\niteratively generates configuration batches, whose properties are computed with\nconventional atomic-scale methods. These results are then fed back into the\nmodel to estimate the partition function, repeating the process until\nconvergence. We illustrate our approach by computing point-defect formation\nenergies and concentrations in (U, Pu)O2 mixed-oxide fuels. In addition, the ML\nmodel provides valuable insights into the physical factors influencing the\ntarget property. Our method is generally applicable to explore other\nproperties, such as atomic-scale diffusion coefficients, in ideally or\nnon-ideally disordered materials like high-entropy alloys.\n","authors":["Maciej J. Karcz","Luca Messina","Eiji Kawasaki","Emeric Bourasseau"],"pdf_url":"https://arxiv.org/pdf/2408.14928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14915v1","updated":"2024-08-27T09:44:01Z","published":"2024-08-27T09:44:01Z","title":"Can Transformers Do Enumerative Geometry?","summary":" How can Transformers model and learn enumerative geometry? What is a robust\nprocedure for using Transformers in abductive knowledge discovery within a\nmathematician-machine collaboration? In this work, we introduce a new paradigm\nin computational enumerative geometry in analyzing the $\\psi$-class\nintersection numbers on the moduli space of curves. By formulating the\nenumerative problem as a continuous optimization task, we develop a\nTransformer-based model for computing $\\psi$-class intersection numbers based\non the underlying quantum Airy structure. For a finite range of genera, our\nmodel is capable of regressing intersection numbers that span an extremely wide\nrange of values, from $10^{-45}$ to $10^{45}$. To provide a proper inductive\nbias for capturing the recursive behavior of intersection numbers, we propose a\nnew activation function, Dynamic Range Activator (DRA). Moreover, given the\nsevere heteroscedasticity of $\\psi$-class intersections and the required\nprecision, we quantify the uncertainty of the predictions using Conformal\nPrediction with a dynamic sliding window that is aware of the number of marked\npoints. 
Next, we go beyond merely computing intersection numbers and explore\nthe enumerative \"world-model\" of the Transformers. Through a series of causal\ninference and correlational interpretability analyses, we demonstrate that\nTransformers are actually modeling Virasoro constraints in a purely data-driven\nmanner. Additionally, we provide evidence for the comprehension of several\nvalues appearing in the large genus asymptotic of $\\psi$-class intersection\nnumbers through abductive hypothesis testing.\n","authors":["Baran Hashemi","Roderic G. Corominas","Alessandro Giacchetto"],"pdf_url":"https://arxiv.org/pdf/2408.14915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14909v1","updated":"2024-08-27T09:35:49Z","published":"2024-08-27T09:35:49Z","title":"SpikingSSMs: Learning Long Sequences with Sparse and Parallel Spiking\n State Space Models","summary":" Known as low energy consumption networks, spiking neural networks (SNNs) have\ngained a lot of attention within the past decades. While SNNs are increasing\ncompetitive with artificial neural networks (ANNs) for vision tasks, they are\nrarely used for long sequence tasks, despite their intrinsic temporal dynamics.\nIn this work, we develop spiking state space models (SpikingSSMs) for long\nsequence learning by leveraging on the sequence learning abilities of state\nspace models (SSMs). Inspired by dendritic neuron structure, we hierarchically\nintegrate neuronal dynamics with the original SSM block, meanwhile realizing\nsparse synaptic computation. Furthermore, to solve the conflict of event-driven\nneuronal dynamics with parallel computing, we propose a light-weight surrogate\ndynamic network which accurately predicts the after-reset membrane potential\nand compatible to learnable thresholds, enabling orders of acceleration in\ntraining speed compared with conventional iterative methods. On the long range\narena benchmark task, SpikingSSM achieves competitive performance to\nstate-of-the-art SSMs meanwhile realizing on average 90\\% of network sparsity.\nOn language modeling, our network significantly surpasses existing spiking\nlarge language models (spikingLLMs) on the WikiText-103 dataset with only a\nthird of the model size, demonstrating its potential as backbone architecture\nfor low computation cost LLMs.\n","authors":["Shuaijie Shen","Chao Wang","Renzhuo Huang","Yan Zhong","Qinghai Guo","Zhichao Lu","Jianguo Zhang","Luziwei Leng"],"pdf_url":"https://arxiv.org/pdf/2408.14909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10779v2","updated":"2024-08-27T09:28:35Z","published":"2024-05-17T13:40:59Z","title":"Baseline Results for Selected Nonlinear System Identification Benchmarks","summary":" Nonlinear system identification remains an important open challenge across\nresearch and academia. Large numbers of novel approaches are seen published\neach year, each presenting improvements or extensions to existing methods. It\nis natural, therefore, to consider how one might choose between these competing\nmodels. Benchmark datasets provide one clear way to approach this question.\nHowever, to make meaningful inference based on benchmark performance it is\nimportant to understand how well a new method performs comparatively to results\navailable with well-established methods. 
This paper presents a set of ten\nbaseline techniques and their relative performances on five popular benchmarks.\nThe aim of this contribution is to stimulate thought and discussion regarding\nobjective comparison of identification methodologies.\n","authors":["Max D. Champneys","Gerben I. Beintema","Roland Tóth","Maarten Schoukens","Timothy J. Rogers"],"pdf_url":"https://arxiv.org/pdf/2405.10779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05655v2","updated":"2024-08-27T09:24:41Z","published":"2023-10-09T12:10:51Z","title":"Causal structure learning with momentum: Sampling distributions over\n Markov Equivalence Classes of DAGs","summary":" In the context of inferring a Bayesian network structure (directed acyclic\ngraph, DAG for short), we devise a non-reversible continuous time Markov chain,\nthe ``Causal Zig-Zag sampler'', that targets a probability distribution over\nclasses of observationally equivalent (Markov equivalent) DAGs. The classes are\nrepresented as completed partially directed acyclic graphs (CPDAGs). The\nnon-reversible Markov chain relies on the operators used in Chickering's Greedy\nEquivalence Search (GES) and is endowed with a momentum variable, which\nimproves mixing significantly as we show empirically. The possible target\ndistributions include posterior distributions based on a prior over DAGs and a\nMarkov equivalent likelihood. We offer an efficient implementation wherein we\ndevelop new algorithms for listing, counting, uniformly sampling, and applying\npossible moves of the GES operators, all of which significantly improve upon\nthe state-of-the-art run-time.\n","authors":["Moritz Schauer","Marcel Wienöbst"],"pdf_url":"https://arxiv.org/pdf/2310.05655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.06458v3","updated":"2024-08-27T09:09:05Z","published":"2021-09-14T05:54:29Z","title":"A Note on Knowledge Distillation Loss Function for Object Classification","summary":" This research note provides a quick introduction to the knowledge\ndistillation loss function used in object classification. In particular, we\ndiscuss its connection to a previously proposed logits matching loss function.\nWe further treat knowledge distillation as a specific form of output\nregularization and demonstrate its connection to label smoothing and\nentropy-based regularization.\n","authors":["Defang Chen"],"pdf_url":"https://arxiv.org/pdf/2109.06458v3.pdf","comment":"Research Note, 4 pages"},{"id":"http://arxiv.org/abs/2408.14890v1","updated":"2024-08-27T09:06:29Z","published":"2024-08-27T09:06:29Z","title":"Development of Large Annotated Music Datasets using HMM-based Forced\n Viterbi Alignment","summary":" Datasets are essential for any machine learning task. Automatic Music\nTranscription (AMT) is one such task, where considerable amount of data is\nrequired depending on the way the solution is achieved. Considering the fact\nthat a music dataset, complete with audio and its time-aligned transcriptions\nwould require the effort of people with musical experience, it could be stated\nthat the task becomes even more challenging. Musical experience is required in\nplaying the musical instrument(s), and in annotating and verifying the\ntranscriptions. We propose a method that would help in streamlining this\nprocess, making the task of obtaining a dataset from a particular instrument\neasy and efficient. We use predefined guitar exercises and hidden Markov\nmodel(HMM) based forced viterbi alignment to accomplish this. The guitar\nexercises are designed to be simple. 
Since the note sequence are already\ndefined, HMM based forced viterbi alignment provides time-aligned\ntranscriptions of these audio files. The onsets of the transcriptions are\nmanually verified and the labels are accurate up to 10ms, averaging at 5ms. The\ncontributions of the proposed work is two fold, i) a well streamlined and\nefficient method for generating datasets for any instrument, especially\nmonophonic and, ii) an acoustic plectrum guitar dataset containing wave files\nand transcriptions in the form of label files. This method will aid as a\npreliminary step towards building concrete datasets for building AMT systems\nfor different instruments.\n","authors":["S. Johanan Joysingh","P. Vijayalakshmi","T. Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2408.14890v1.pdf","comment":"submitted to TENCON 2019"},{"id":"http://arxiv.org/abs/2408.08448v3","updated":"2024-08-27T09:04:35Z","published":"2024-08-15T22:57:39Z","title":"Exploring Cross-model Neuronal Correlations in the Context of Predicting\n Model Performance and Generalizability","summary":" As Artificial Intelligence (AI) models are increasingly integrated into\ncritical systems, the need for a robust framework to establish the\ntrustworthiness of AI is increasingly paramount. While collaborative efforts\nhave established conceptual foundations for such a framework, there remains a\nsignificant gap in developing concrete, technically robust methods for\nassessing AI model quality and performance. A critical drawback in the\ntraditional methods for assessing the validity and generalizability of models\nis their dependence on internal developer datasets, rendering it challenging to\nindependently assess and verify their performance claims. This paper introduces\na novel approach for assessing a newly trained model's performance based on\nanother known model by calculating correlation between neural networks. The\nproposed method evaluates correlations by determining if, for each neuron in\none network, there exists a neuron in the other network that produces similar\noutput. This approach has implications for memory efficiency, allowing for the\nuse of smaller networks when high correlation exists between networks of\ndifferent sizes. Additionally, the method provides insights into robustness,\nsuggesting that if two highly correlated networks are compared and one\ndemonstrates robustness when operating in production environments, the other is\nlikely to exhibit similar robustness. This contribution advances the technical\ntoolkit for responsible AI, supporting more comprehensive and nuanced\nevaluations of AI models to ensure their safe and effective deployment. Code is\navailable at https://github.com/aheldis/Cross-model-correlation.git.\n","authors":["Haniyeh Ehsani Oskouie","Lionel Levine","Majid Sarrafzadeh"],"pdf_url":"https://arxiv.org/pdf/2408.08448v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14889v1","updated":"2024-08-27T09:04:08Z","published":"2024-08-27T09:04:08Z","title":"Towards turbine-location-aware multi-decadal wind power predictions with\n CMIP6","summary":" With the increasing amount of renewable energy in the grid, long-term wind\npower forecasting for multiple decades becomes more critical. In these\nlong-term forecasts, climate data is essential as it allows us to account for\nclimate change. Yet the resolution of climate models is often very coarse. 
In\nthis paper, we show that by including turbine locations when downscaling with\nGaussian Processes, we can generate valuable aggregate wind power predictions\ndespite the low resolution of the CMIP6 climate models. This work is a first\nstep towards multi-decadal turbine-location-aware wind power forecasting using\nglobal climate model output.\n","authors":["Nina Effenberger","Nicole Ludwig"],"pdf_url":"https://arxiv.org/pdf/2408.14889v1.pdf","comment":"4 pages, pre-print"},{"id":"http://arxiv.org/abs/2408.14887v1","updated":"2024-08-27T09:00:27Z","published":"2024-08-27T09:00:27Z","title":"Literary and Colloquial Dialect Identification for Tamil using Acoustic\n Features","summary":" The evolution and diversity of a language is evident from it's various\ndialects. If the various dialects are not addressed in technological\nadvancements like automatic speech recognition and speech synthesis, there is a\nchance that these dialects may disappear. Speech technology plays a role in\npreserving various dialects of a language from going extinct. In order to build\na full fledged automatic speech recognition system that addresses various\ndialects, an Automatic Dialect Identification (ADI) system acting as the front\nend is required. This is similar to how language identification systems act as\nfront ends to automatic speech recognition systems that handle multiple\nlanguages. The current work proposes a way to identify two popular and broadly\nclassified Tamil dialects, namely literary and colloquial Tamil. Acoustical\ncharacteristics rather than phonetics and phonotactics are used, alleviating\nthe requirement of language-dependant linguistic tools. Hence one major\nadvantage of the proposed method is that it does not require an annotated\ncorpus, hence it can be easily adapted to other languages. Gaussian Mixture\nModels (GMM) using Mel Frequency Cepstral Coefficient (MFCC) features are used\nto perform the classification task. The experiments yielded an error rate of\n12%. Vowel nasalization, as being the reason for this good performance, is\ndiscussed. The number of mixture models for the GMM is varied and the\nperformance is analysed.\n","authors":["M. Nanmalar","P. Vijayalakshmi","T. Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2408.14887v1.pdf","comment":"submitted to TENCON 2019"},{"id":"http://arxiv.org/abs/2408.14875v1","updated":"2024-08-27T08:44:31Z","published":"2024-08-27T08:44:31Z","title":"Adversarial Attacks and Defenses in Multivariate Time-Series Forecasting\n for Smart and Connected Infrastructures","summary":" The emergence of deep learning models has revolutionized various industries\nover the last decade, leading to a surge in connected devices and\ninfrastructures. However, these models can be tricked into making incorrect\npredictions with high confidence, leading to disastrous failures and security\nconcerns. To this end, we explore the impact of adversarial attacks on\nmultivariate time-series forecasting and investigate methods to counter them.\nSpecifically, we employ untargeted white-box attacks, namely the Fast Gradient\nSign Method (FGSM) and the Basic Iterative Method (BIM), to poison the inputs\nto the training process, effectively misleading the model. We also illustrate\nthe subtle modifications to the inputs after the attack, which makes detecting\nthe attack using the naked eye quite difficult. Having demonstrated the\nfeasibility of these attacks, we develop robust models through adversarial\ntraining and model hardening. 
We are among the first to showcase the\ntransferability of these attacks and defenses by extrapolating our work from\nthe benchmark electricity data to a larger, 10-year real-world data used for\npredicting the time-to-failure of hard disks. Our experimental results confirm\nthat the attacks and defenses achieve the desired security thresholds, leading\nto a 72.41% and 94.81% decrease in RMSE for the electricity and hard disk\ndatasets respectively after implementing the adversarial defenses.\n","authors":["Pooja Krishan","Rohan Mohapatra","Saptarshi Sengupta"],"pdf_url":"https://arxiv.org/pdf/2408.14875v1.pdf","comment":"17 pages, 32 figures"},{"id":"http://arxiv.org/abs/2405.07488v2","updated":"2024-08-27T08:44:20Z","published":"2024-05-13T06:04:26Z","title":"Predictive Modeling of Flexible EHD Pumps using Kolmogorov-Arnold\n Networks","summary":" We present a novel approach to predicting the pressure and flow rate of\nflexible electrohydrodynamic pumps using the Kolmogorov-Arnold Network.\nInspired by the Kolmogorov-Arnold representation theorem, KAN replaces fixed\nactivation functions with learnable spline-based activation functions, enabling\nit to approximate complex nonlinear functions more effectively than traditional\nmodels like Multi-Layer Perceptron and Random Forest. We evaluated KAN on a\ndataset of flexible EHD pump parameters and compared its performance against\nRF, and MLP models. KAN achieved superior predictive accuracy, with Mean\nSquared Errors of 12.186 and 0.001 for pressure and flow rate predictions,\nrespectively. The symbolic formulas extracted from KAN provided insights into\nthe nonlinear relationships between input parameters and pump performance.\nThese findings demonstrate that KAN offers exceptional accuracy and\ninterpretability, making it a promising alternative for predictive modeling in\nelectrohydrodynamic pumping.\n","authors":["Yanhong Peng","Yuxin Wang","Fangchao Hu","Miao He","Zebing Mao","Xia Huang","Jun Ding"],"pdf_url":"https://arxiv.org/pdf/2405.07488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14871v1","updated":"2024-08-27T08:41:42Z","published":"2024-08-27T08:41:42Z","title":"Learning Robust Reward Machines from Noisy Labels","summary":" This paper presents PROB-IRM, an approach that learns robust reward machines\n(RMs) for reinforcement learning (RL) agents from noisy execution traces. The\nkey aspect of RM-driven RL is the exploitation of a finite-state machine that\ndecomposes the agent's task into different subtasks. PROB-IRM uses a\nstate-of-the-art inductive logic programming framework robust to noisy examples\nto learn RMs from noisy traces using the Bayesian posterior degree of beliefs,\nthus ensuring robustness against inconsistencies. Pivotal for the results is\nthe interleaving between RM learning and policy learning: a new RM is learned\nwhenever the RL agent generates a trace that is believed not to be accepted by\nthe current RM. To speed up the training of the RL agent, PROB-IRM employs a\nprobabilistic formulation of reward shaping that uses the posterior Bayesian\nbeliefs derived from the traces. Our experimental analysis shows that PROB-IRM\ncan learn (potentially imperfect) RMs from noisy traces and exploit them to\ntrain an RL agent to solve its tasks successfully. 
Despite the complexity of\nlearning the RM from noisy traces, agents trained with PROB-IRM perform\ncomparably to agents provided with handcrafted RMs.\n","authors":["Roko Parac","Lorenzo Nodari","Leo Ardon","Daniel Furelos-Blanco","Federico Cerutti","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2408.14871v1.pdf","comment":"Preprint accepted for publication to the 21st International\n Conference on Principles of Knowledge Representation and Reasoning (KR 2024)"},{"id":"http://arxiv.org/abs/2308.16818v3","updated":"2024-08-27T08:39:38Z","published":"2023-08-31T15:49:21Z","title":"Irregular Traffic Time Series Forecasting Based on Asynchronous\n Spatio-Temporal Graph Convolutional Network","summary":" Accurate traffic forecasting is crucial for the development of Intelligent\nTransportation Systems (ITS), playing a pivotal role in modern urban traffic\nmanagement. Traditional forecasting methods, however, struggle with the\nirregular traffic time series resulting from adaptive traffic signal controls,\npresenting challenges in asynchronous spatial dependency, irregular temporal\ndependency, and predicting variable-length sequences. To this end, we propose\nan Asynchronous Spatio-tEmporal graph convolutional nEtwoRk (ASeer) tailored\nfor irregular traffic time series forecasting. Specifically, we first propose\nan Asynchronous Graph Diffusion Network to capture the spatial dependency\nbetween asynchronously measured traffic states regulated by adaptive traffic\nsignals. After that, to capture the temporal dependency within irregular\ntraffic state sequences, a personalized time encoding is devised to embed the\ncontinuous time signals. Then, we propose a Transformable Time-aware\nConvolution Network, which adapts meta-filters for time-aware convolution on\nthe sequences with inconsistent temporal flow. Additionally, a\nSemi-Autoregressive Prediction Network, comprising a state evolution unit and a\nsemi-autoregressive predictor, is designed to predict variable-length traffic\nsequences effectively and efficiently. Extensive experiments on a newly\nestablished benchmark demonstrate the superiority of ASeer compared with twelve\ncompetitive baselines across six metrics.\n","authors":["Weijia Zhang","Le Zhang","Jindong Han","Hao Liu","Yanjie Fu","Jingbo Zhou","Yu Mei","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.16818v3.pdf","comment":"This work is published in the research track of KDD 2024"},{"id":"http://arxiv.org/abs/2408.14866v1","updated":"2024-08-27T08:38:48Z","published":"2024-08-27T08:38:48Z","title":"Advancing Adversarial Suffix Transfer Learning on Aligned Large Language\n Models","summary":" Large Language Models (LLMs) face safety concerns due to potential misuse\nby malicious users. Recent red-teaming efforts have identified adversarial\nsuffixes capable of jailbreaking LLMs using the gradient-based search algorithm\nGreedy Coordinate Gradient (GCG). However, GCG struggles with computational\ninefficiency, limiting further investigations regarding suffix transferability\nand scalability across models and data. In this work, we bridge the connection\nbetween search efficiency and suffix transferability. We propose a two-stage\ntransfer learning framework, DeGCG, which decouples the search process into\nbehavior-agnostic pre-searching and behavior-relevant post-searching.\nSpecifically, we employ direct first target token optimization in pre-searching\nto facilitate the search process. We apply our approach to cross-model,\ncross-data, and self-transfer scenarios. 
Furthermore, we introduce an\ninterleaved variant of our approach, i-DeGCG, which iteratively leverages\nself-transferability to accelerate the search process. Experiments on HarmBench\ndemonstrate the efficiency of our approach across various models and domains.\nNotably, our i-DeGCG outperforms the baseline on Llama2-chat-7b with ASRs of\n$43.9$ ($+22.2$) and $39.0$ ($+19.5$) on valid and test sets, respectively.\nFurther analysis on cross-model transfer indicates the pivotal role of first\ntarget token optimization in leveraging suffix transferability for efficient\nsearching.\n","authors":["Hongfu Liu","Yuxi Xie","Ye Wang","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2408.14866v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.14865v1","updated":"2024-08-27T08:38:45Z","published":"2024-08-27T08:38:45Z","title":"Data downlink prioritization using image classification on-board a 6U\n CubeSat","summary":" Nanosatellites are proliferating as low-cost dedicated sensing systems with\nlean development cycles. Kyushu Institute of Technology and collaborators have\nlaunched a joint venture for a nanosatellite mission, VERTECS. The primary\nmission is to elucidate the formation history of stars by observing the\noptical-wavelength cosmic background radiation. The VERTECS satellite will be\nequipped with a small-aperture telescope and a high-precision attitude control\nsystem to capture the cosmic data for analysis on the ground. However,\nnanosatellites are limited by their onboard memory resources and downlink speed\ncapabilities. Additionally, due to a limited number of ground stations, the\nsatellite mission will face issues meeting the required data budget for mission\nsuccess. To alleviate this issue, we propose an on-orbit system to autonomously\nclassify and then compress desirable image data for data downlink\nprioritization and optimization. The system comprises a prototype Camera\nController Board (CCB) which carries a Raspberry Pi Compute Module 4 which is\nused for classification and compression. The system uses a lightweight\nConvolutional Neural Network (CNN) model to classify and determine the\ndesirability of captured image data. The model is designed to be lean and\nrobust to reduce the computational and memory load on the satellite. The model\nis trained and tested on a novel star field dataset consisting of data captured\nby the Sloan Digital Sky Survey (SDSS). The dataset is meant to simulate the\nexpected data produced by the 6U satellite. The compression step implements\nGZip, RICE or HCOMPRESS compression, which are standards for astronomical data.\nPreliminary testing on the proposed CNN model results in a classification\naccuracy of about 100\\% on the star field dataset, with compression ratios of\n3.99, 5.16 and 5.43 for GZip, RICE and HCOMPRESS that were achieved on tested\nFITS image data.\n","authors":["Keenan A. A. Chatar","Ezra Fielding","Kei Sano","Kentaro Kitamura"],"pdf_url":"https://arxiv.org/pdf/2408.14865v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.14864v1","updated":"2024-08-27T08:38:17Z","published":"2024-08-27T08:38:17Z","title":"Dynamic operator management in meta-heuristics using reinforcement\n learning: an application to permutation flowshop scheduling problems","summary":" This study develops a framework based on reinforcement learning to\ndynamically manage a large portfolio of search operators within\nmeta-heuristics. 
Using the idea of tabu search, the framework allows for\ncontinuous adaptation by temporarily excluding less efficient operators and\nupdating the portfolio composition during the search. A Q-learning-based\nadaptive operator selection mechanism is used to select the most suitable\noperator from the dynamically updated portfolio at each stage. Unlike\ntraditional approaches, the proposed framework requires no input from the\nexperts regarding the search operators, allowing domain-specific non-experts to\neffectively use the framework. The performance of the proposed framework is\nanalyzed through an application to the permutation flowshop scheduling problem.\nThe results demonstrate the superior performance of the proposed framework\nagainst state-of-the-art algorithms in terms of optimality gap and convergence\nspeed.\n","authors":["Maryam Karimi Mamaghan","Mehrdad Mohammadi","Wout Dullaert","Daniele Vigo","Amir Pirayesh"],"pdf_url":"https://arxiv.org/pdf/2408.14864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03870v2","updated":"2024-08-27T08:31:04Z","published":"2024-03-06T17:23:28Z","title":"Learning to Decode Collaboratively with Multiple Language Models","summary":" We propose a method to teach multiple large language models (LLM) to\ncollaborate by interleaving their generations at the token level. We model the\ndecision of which LLM generates the next token as a latent variable. By\noptimizing the marginal likelihood of a training set under our latent variable\nmodel, the base LLM automatically learns when to generate itself and when to\ncall on one of the ``assistant'' language models to generate, all without\ndirect supervision. Token-level collaboration during decoding allows for a\nfusion of each model's expertise in a manner tailored to the specific task at\nhand. Our collaborative decoding is especially useful in cross-domain settings\nwhere a generalist base LLM learns to invoke domain expert models. On\ninstruction-following, domain-specific QA, and reasoning tasks, we show that\nthe performance of the joint system exceeds that of the individual models.\nThrough qualitative analysis of the learned latent decisions, we show models\ntrained with our method exhibit several interesting collaboration patterns,\ne.g., template-filling. Our code is available at\nhttps://github.com/clinicalml/co-llm.\n","authors":["Shannon Zejiang Shen","Hunter Lang","Bailin Wang","Yoon Kim","David Sontag"],"pdf_url":"https://arxiv.org/pdf/2403.03870v2.pdf","comment":"16 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2408.13766v2","updated":"2024-08-27T08:07:20Z","published":"2024-08-25T08:23:06Z","title":"Enhancing Robustness of Human Detection Algorithms in Maritime SAR\n through Augmented Aerial Images to Simulate Weather Conditions","summary":" 7,651 cases of Search and Rescue Missions (SAR) were reported by the United\nStates Coast Guard in 2024, with over 1322 SAR helicopters deployed in the 6\nfirst months alone. Through the utilizations of YOLO, we were able to run\ndifferent weather conditions and lighting from our augmented dataset for\ntraining. YOLO then utilizes CNNs to apply a series of convolutions and pooling\nlayers to the input image, where the convolution layers are able to extract the\nmain features of the image. Through this, our YOLO model is able to learn to\ndifferentiate different objects which may considerably improve its accuracy,\npossibly enhancing the efficiency of SAR operations through enhanced detection\naccuracy. 
This paper aims to improve the model's accuracy of human detection in\nmaritime SAR by evaluating a robust datasets containing various elevations and\ngeological locations, as well as through data augmentation which simulates\ndifferent weather and lighting. We observed that models trained on augmented\ndatasets outperformed their non-augmented counterparts in which the human\nrecall scores ranged from 0.891 to 0.911 with an improvement rate of 3.4\\% on\nthe YOLOv5l model. Results showed that these models demonstrate greater\nrobustness to real-world conditions in varying of weather, brightness, tint,\nand contrast.\n","authors":["Miguel Tjia","Artem Kim","Elaine Wynette Wijaya","Hanna Tefara","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.13766v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14010v2","updated":"2024-08-27T08:02:49Z","published":"2024-08-26T04:31:55Z","title":"Improving Water Quality Time-Series Prediction in Hong Kong using\n Sentinel-2 MSI Data and Google Earth Engine Cloud Computing","summary":" Effective water quality monitoring in coastal regions is crucial due to the\nprogressive deterioration caused by pollution and human activities. To address\nthis, this study develops time-series models to predict chlorophyll-a (Chl-a),\nsuspended solids (SS), and turbidity using Sentinel-2 satellite data and Google\nEarth Engine (GEE) in the coastal regions of Hong Kong. Leveraging Long\nShort-Term Memory (LSTM) Recurrent Neural Networks, the study incorporates\nextensive temporal datasets to enhance prediction accuracy. The models utilize\nspectral data from Sentinel-2, focusing on optically active components, and\ndemonstrate that selected variables closely align with the spectral\ncharacteristics of Chl-a and SS. The results indicate improved predictive\nperformance over previous methods, highlighting the potential for remote\nsensing technology in continuous and comprehensive water quality assessment.\n","authors":["Rohin Sood","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14847v1","updated":"2024-08-27T07:58:08Z","published":"2024-08-27T07:58:08Z","title":"Intraoperative Glioma Segmentation with YOLO + SAM for Improved Accuracy\n in Tumor Resection","summary":" Gliomas, a common type of malignant brain tumor, present significant surgical\nchallenges due to their similarity to healthy tissue. Preoperative Magnetic\nResonance Imaging (MRI) images are often ineffective during surgery due to\nfactors such as brain shift, which alters the position of brain structures and\ntumors. This makes real-time intraoperative MRI (ioMRI) crucial, as it provides\nupdated imaging that accounts for these shifts, ensuring more accurate tumor\nlocalization and safer resections. This paper presents a deep learning pipeline\ncombining You Only Look Once Version 8 (YOLOv8) and Segment Anything Model\nVision Transformer-base (SAM ViT-b) to enhance glioma detection and\nsegmentation during ioMRI. Our model was trained using the Brain Tumor\nSegmentation 2021 (BraTS 2021) dataset, which includes standard magnetic\nresonance imaging (MRI) images, and noise-augmented MRI images that simulate\nioMRI images. Noised MRI images are harder for a deep learning pipeline to\nsegment, but they are more representative of surgical conditions. Achieving a\nDice Similarity Coefficient (DICE) score of 0.79, our model performs comparably\nto state-of-the-art segmentation models tested on noiseless data. 
This\nperformance demonstrates the model's potential to assist surgeons in maximizing\ntumor resection and improving surgical outcomes.\n","authors":["Samir Kassam","Angelo Markham","Katie Vo","Yashas Revanakara","Michael Lam","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.14847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14843v1","updated":"2024-08-27T07:54:15Z","published":"2024-08-27T07:54:15Z","title":"Correntropy-Based Improper Likelihood Model for Robust\n Electrophysiological Source Imaging","summary":" Bayesian learning provides a unified skeleton to solve the\nelectrophysiological source imaging task. From this perspective, existing\nsource imaging algorithms utilize the Gaussian assumption for the observation\nnoise to build the likelihood function for Bayesian inference. However, the\nelectromagnetic measurements of brain activity are usually affected by\nmiscellaneous artifacts, leading to a potentially non-Gaussian distribution for\nthe observation noise. Hence the conventional Gaussian likelihood model is a\nsuboptimal choice for the real-world source imaging task. In this study, we aim\nto solve this problem by proposing a new likelihood model which is robust with\nrespect to non-Gaussian noises. Motivated by the robust maximum correntropy\ncriterion, we propose a new improper distribution model concerning the noise\nassumption. This new noise distribution is leveraged to structure a robust\nlikelihood function and integrated with hierarchical prior distributions to\nestimate source activities by variational inference. In particular, the score\nmatching is adopted to determine the hyperparameters for the improper\nlikelihood model. A comprehensive performance evaluation is performed to\ncompare the proposed noise assumption to the conventional Gaussian model.\nSimulation results show that, the proposed method can realize more precise\nsource reconstruction by designing known ground-truth. The real-world dataset\nalso demonstrates the superiority of our new method with the visual perception\ntask. This study provides a new backbone for Bayesian source imaging, which\nwould facilitate its application using real-world noisy brain signal.\n","authors":["Yuanhao Li","Badong Chen","Zhongxu Hu","Keita Suzuki","Wenjun Bai","Yasuharu Koike","Okito Yamashita"],"pdf_url":"https://arxiv.org/pdf/2408.14843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14842v1","updated":"2024-08-27T07:54:01Z","published":"2024-08-27T07:54:01Z","title":"From Bias to Balance: Detecting Facial Expression Recognition Biases in\n Large Multimodal Foundation Models","summary":" This study addresses the racial biases in facial expression recognition (FER)\nsystems within Large Multimodal Foundation Models (LMFMs). Despite advances in\ndeep learning and the availability of diverse datasets, FER systems often\nexhibit higher error rates for individuals with darker skin tones. Existing\nresearch predominantly focuses on traditional FER models (CNNs, RNNs, ViTs),\nleaving a gap in understanding racial biases in LMFMs. We benchmark four\nleading LMFMs: GPT-4o, PaliGemma, Gemini, and CLIP to assess their performance\nin facial emotion detection across different racial demographics. A linear\nclassifier trained on CLIP embeddings obtains accuracies of 95.9\\% for RADIATE,\n90.3\\% for Tarr, and 99.5\\% for Chicago Face. Furthermore, we identify that\nAnger is misclassified as Disgust 2.1 times more often in Black Females than\nWhite Females. 
This study highlights the need for fairer FER systems and\nestablishes a foundation for developing unbiased, accurate FER technologies.\nVisit https://kvjvhub.github.io/FERRacialBias/ for further information\nregarding the biases within facial expression recognition.\n","authors":["Kaylee Chhua","Zhoujinyi Wen","Vedant Hathalia","Kevin Zhu","Sean O'Brien"],"pdf_url":"https://arxiv.org/pdf/2408.14842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14840v1","updated":"2024-08-27T07:51:26Z","published":"2024-08-27T07:51:26Z","title":"CL4KGE: A Curriculum Learning Method for Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) constitutes a foundational task, directed\ntowards learning representations for entities and relations within knowledge\ngraphs (KGs), with the objective of crafting representations comprehensive\nenough to approximate the logical and symbolic interconnections among entities.\nIn this paper, we define a metric Z-counts to measure the difficulty of\ntraining each triple ($<$head entity, relation, tail entity$>$) in KGs with\ntheoretical analysis. Based on this metric, we propose \\textbf{CL4KGE}, an\nefficient \\textbf{C}urriculum \\textbf{L}earning based training strategy for\n\\textbf{KGE}. This method includes a difficulty measurer and a training\nscheduler that aids in the training of KGE models. Our approach possesses the\nflexibility to act as a plugin within a wide range of KGE models, with the\nadded advantage of adaptability to the majority of KGs in existence. The\nproposed method has been evaluated on popular KGE models, and the results\ndemonstrate that it enhances the state-of-the-art methods. The use of Z-counts\nas a metric has enabled the identification of challenging triples in KGs, which\nhelps in devising effective training strategies.\n","authors":["Yang Liu","Chuan Zhou","Peng Zhang","Yanan Cao","Yongchao Liu","Zhao Li","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14840v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.14837v1","updated":"2024-08-27T07:46:07Z","published":"2024-08-27T07:46:07Z","title":"Diffusion Models Are Real-Time Game Engines","summary":" We present GameNGen, the first game engine powered entirely by a neural model\nthat enables real-time interaction with a complex environment over long\ntrajectories at high quality. GameNGen can interactively simulate the classic\ngame DOOM at over 20 frames per second on a single TPU. Next frame prediction\nachieves a PSNR of 29.4, comparable to lossy JPEG compression. Human raters are\nonly slightly better than random chance at distinguishing short clips of the\ngame from clips of the simulation. GameNGen is trained in two phases: (1) an\nRL-agent learns to play the game and the training sessions are recorded, and\n(2) a diffusion model is trained to produce the next frame, conditioned on the\nsequence of past frames and actions. 
Conditioning augmentations enable stable\nauto-regressive generation over long trajectories.\n","authors":["Dani Valevski","Yaniv Leviathan","Moab Arar","Shlomi Fruchter"],"pdf_url":"https://arxiv.org/pdf/2408.14837v1.pdf","comment":"Project page: https://gamengen.github.io/"},{"id":"http://arxiv.org/abs/2405.18723v3","updated":"2024-08-27T07:31:44Z","published":"2024-05-29T03:08:30Z","title":"Conformal Depression Prediction","summary":" While existing depression prediction methods based on deep learning show\npromise, their practical application is hindered by the lack of\ntrustworthiness, as these deep models are often deployed as black box models,\nleaving us uncertain on the confidence of their predictions. For high-risk\nclinical applications like depression prediction, uncertainty quantification is\nessential in decision-making. In this paper, we introduce conformal depression\nprediction (CDP), a depression prediction method with uncertainty\nquantification based on conformal prediction (CP), giving valid confidence\nintervals with theoretical coverage guarantees for the model predictions. CDP\nis a plug-and-play module that requires neither model retraining nor an\nassumption about the depression data distribution. As CDP provides only an\naverage coverage guarantee across all inputs rather than per-input performance\nguarantee, we further propose CDP-ACC, an improved conformal prediction with\napproximate conditional coverage. CDP-ACC firstly estimates the prediction\ndistribution through neighborhood relaxation, and then introduces a conformal\nscore function by constructing nested sequences, so as to provide a tighter\nprediction interval adaptive to specific input. We empirically demonstrate the\napplication of CDP in uncertainty-aware facial depression prediction, as well\nas the effectiveness and superiority of CDP-ACC on the AVEC 2013 and AVEC 2014\ndatasets. Our code is publicly available at https://github.com/PushineLee/CDP.\n","authors":["Yonghong Li","Xiuzhuang Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.18723v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14831v1","updated":"2024-08-27T07:28:05Z","published":"2024-08-27T07:28:05Z","title":"DRL-Based Federated Self-Supervised Learning for Task Offloading and\n Resource Allocation in ISAC-Enabled Vehicle Edge Computing","summary":" Intelligent Transportation Systems (ITS) leverage Integrated Sensing and\nCommunications (ISAC) to enhance data exchange between vehicles and\ninfrastructure in the Internet of Vehicles (IoV). This integration inevitably\nincreases computing demands, risking real-time system stability. Vehicle Edge\nComputing (VEC) addresses this by offloading tasks to Road Side Unit (RSU),\nensuring timely services. Our previous work FLSimCo algorithm, which uses local\nresources for Federated Self-Supervised Learning (SSL), though vehicles often\ncan't complete all iterations task. Our improved algorithm offloads partial\ntask to RSU and optimizes energy consumption by adjusting transmission power,\nCPU frequency, and task assignment ratios, balancing local and RSU-based\ntraining. Meanwhile, setting an offloading threshold further prevents\ninefficiencies. Simulation results show that the enhanced algorithm reduces\nenergy consumption, improves offloading efficiency and the accuracy of\nFederated SSL.\n","authors":["Xueying Gu","Qiong Wu","Pingyi Fan","Nan Cheng","Wen Chen","Khaled B. 
Letaief"],"pdf_url":"https://arxiv.org/pdf/2408.14831v1.pdf","comment":"This paper has been submitted to Digital Communications and Networks.\n The source code has been released at:\n https://github.com/qiongwu86/Federated-SSL-task-offloading-and-resource-allocation"},{"id":"http://arxiv.org/abs/2408.13751v2","updated":"2024-08-27T07:26:20Z","published":"2024-08-25T07:32:58Z","title":"Improved identification of breakpoints in piecewise regression and its\n applications","summary":" Identifying breakpoints in piecewise regression is critical in enhancing the\nreliability and interpretability of data fitting. In this paper, we propose\nnovel algorithms based on the greedy algorithm to accurately and efficiently\nidentify breakpoints in piecewise polynomial regression. The algorithm updates\nthe breakpoints to minimize the error by exploring the neighborhood of each\nbreakpoint. It has a fast convergence rate and stability to find optimal\nbreakpoints. Moreover, it can determine the optimal number of breakpoints. The\ncomputational results for real and synthetic data show that its accuracy is\nbetter than any existing methods. The real-world datasets demonstrate that\nbreakpoints through the proposed algorithm provide valuable data information.\n","authors":["Taehyeong Kim","Hyungu Lee","Hayoung Choi"],"pdf_url":"https://arxiv.org/pdf/2408.13751v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.14825v1","updated":"2024-08-27T07:11:45Z","published":"2024-08-27T07:11:45Z","title":"From Rule-Based Models to Deep Learning Transformers Architectures for\n Natural Language Processing and Sign Language Translation Systems: Survey,\n Taxonomy and Performance Evaluation","summary":" With the growing Deaf and Hard of Hearing population worldwide and the\npersistent shortage of certified sign language interpreters, there is a\npressing need for an efficient, signs-driven, integrated end-to-end translation\nsystem, from sign to gloss to text and vice-versa. There has been a wealth of\nresearch on machine translations and related reviews. However, there are few\nworks on sign language machine translation considering the particularity of the\nlanguage being continuous and dynamic. This paper aims to address this void,\nproviding a retrospective analysis of the temporal evolution of sign language\nmachine translation algorithms and a taxonomy of the Transformers\narchitectures, the most used approach in language translation. We also present\nthe requirements of a real-time Quality-of-Service sign language ma-chine\ntranslation system underpinned by accurate deep learning algorithms. We propose\nfuture research directions for sign language translation systems.\n","authors":["Nada Shahin","Leila Ismail"],"pdf_url":"https://arxiv.org/pdf/2408.14825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14821v1","updated":"2024-08-27T07:03:51Z","published":"2024-08-27T07:03:51Z","title":"Data-driven Effective Modeling of Multiscale Stochastic Dynamical\n Systems","summary":" We present a numerical method for learning the dynamics of slow components of\nunknown multiscale stochastic dynamical systems. While the governing equations\nof the systems are unknown, bursts of observation data of the slow variables\nare available. By utilizing the observation data, our proposed method is\ncapable of constructing a generative stochastic model that can accurately\ncapture the effective dynamics of the slow variables in distribution. 
We\npresent a comprehensive set of numerical examples to demonstrate the\nperformance of the proposed method.\n","authors":["Yuan Chen","Dongbin Xiu"],"pdf_url":"https://arxiv.org/pdf/2408.14821v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.15747"},{"id":"http://arxiv.org/abs/2408.14817v1","updated":"2024-08-27T06:58:52Z","published":"2024-08-27T06:58:52Z","title":"A Comprehensive Benchmark of Machine and Deep Learning Across Diverse\n Tabular Datasets","summary":" The analysis of tabular datasets is highly prevalent both in scientific\nresearch and real-world applications of Machine Learning (ML). Unlike many\nother ML tasks, Deep Learning (DL) models often do not outperform traditional\nmethods in this area. Previous comparative benchmarks have shown that DL\nperformance is frequently equivalent or even inferior to models such as\nGradient Boosting Machines (GBMs). In this study, we introduce a comprehensive\nbenchmark aimed at better characterizing the types of datasets where DL models\nexcel. Although several important benchmarks for tabular datasets already\nexist, our contribution lies in the variety and depth of our comparison: we\nevaluate 111 datasets with 20 different models, including both regression and\nclassification tasks. These datasets vary in scale and include both those with\nand without categorical variables. Importantly, our benchmark contains a\nsufficient number of datasets where DL models perform best, allowing for a\nthorough analysis of the conditions under which DL models excel. Building on\nthe results of this benchmark, we train a model that predicts scenarios where\nDL models outperform alternative methods with 86.1% accuracy (AUC 0.78). We\npresent insights derived from this characterization and compare these findings\nto previous benchmarks.\n","authors":["Assaf Shmuel","Oren Glickman","Teddy Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2408.14817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14806v1","updated":"2024-08-27T06:28:35Z","published":"2024-08-27T06:28:35Z","title":"Poly2Vec: Polymorphic Encoding of Geospatial Objects for Spatial\n Reasoning with Deep Neural Networks","summary":" Encoding geospatial data is crucial for enabling machine learning (ML) models\nto perform tasks that require spatial reasoning, such as identifying the\ntopological relationships between two different geospatial objects. However,\nexisting encoding methods are limited as they are typically customized to\nhandle only specific types of spatial data, which impedes their applicability\nacross different downstream tasks where multiple data types coexist. To address\nthis, we introduce Poly2Vec, an encoding framework that unifies the modeling of\ndifferent geospatial objects, including 2D points, polylines, and polygons,\nirrespective of the downstream task. We leverage the power of the 2D Fourier\ntransform to encode useful spatial properties, such as shape and location, from\ngeospatial objects into fixed-length vectors. These vectors are then inputted\ninto neural network models for spatial reasoning tasks.This unified approach\neliminates the need to develop and train separate models for each distinct\nspatial type. 
We evaluate Poly2Vec on both synthetic and real datasets of mixed\ngeometry types and verify its consistent performance across several downstream\nspatial reasoning tasks.\n","authors":["Maria Despoina Siampou","Jialiang Li","John Krumm","Cyrus Shahabi","Hua Lu"],"pdf_url":"https://arxiv.org/pdf/2408.14806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14797v1","updated":"2024-08-27T06:07:18Z","published":"2024-08-27T06:07:18Z","title":"MaskCycleGAN-based Whisper to Normal Speech Conversion","summary":" Whisper to normal speech conversion is an active area of research. Various\narchitectures based on generative adversarial networks have been proposed in\nthe recent past. Especially, recent study shows that MaskCycleGAN, which is a\nmask guided, and cyclic consistency keeping, generative adversarial network,\nperforms really well for voice conversion from spectrogram representations. In\nthe current work we present a MaskCycleGAN approach for the conversion of\nwhispered speech to normal speech. We find that tuning the mask parameters, and\npre-processing the signal with a voice activity detector provides superior\nperformance when compared to the existing approach. The wTIMIT dataset is used\nfor evaluation. Objective metrics such as PESQ and G-Loss are used to evaluate\nthe converted speech, along with subjective evaluation using mean opinion\nscore. The results show that the proposed approach offers considerable\nbenefits.\n","authors":["K. Rohith Gupta","K. Ramnath","S. Johanan Joysingh","P. Vijayalakshmi","T. Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2408.14797v1.pdf","comment":"submitted to TENCON 2024"},{"id":"http://arxiv.org/abs/2408.14788v1","updated":"2024-08-27T05:28:52Z","published":"2024-08-27T05:28:52Z","title":"Learning from Complementary Features","summary":" While precise data observation is essential for the learning processes of\npredictive models, it can be challenging owing to factors such as insufficient\nobservation accuracy, high collection costs, and privacy constraints. In this\npaper, we examines cases where some qualitative features are unavailable as\nprecise information indicating \"what it is,\" but rather as complementary\ninformation indicating \"what it is not.\" We refer to features defined by\nprecise information as ordinary features (OFs) and those defined by\ncomplementary information as complementary features (CFs). We then formulate a\nnew learning scenario termed Complementary Feature Learning (CFL), where\npredictive models are constructed using instances consisting of OFs and CFs.\nThe simplest formalization of CFL applies conventional supervised learning\ndirectly using the observed values of CFs. However, this approach does not\nresolve the ambiguity associated with CFs, making learning challenging and\ncomplicating the interpretation of the predictive model's specific predictions.\nTherefore, we derive an objective function from an information-theoretic\nperspective to estimate the OF values corresponding to CFs and to predict\noutput labels based on these estimations. Based on this objective function, we\npropose a theoretically guaranteed graph-based estimation method along with its\npractical approximation, for estimating OF values corresponding to CFs. 
The\nresults of numerical experiments conducted with real-world data demonstrate\nthat our proposed method effectively estimates OF values corresponding to CFs\nand predicts output labels.\n","authors":["Kosuke Sugiyama","Masato Uchida"],"pdf_url":"https://arxiv.org/pdf/2408.14788v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.02051v2","updated":"2024-08-27T05:26:14Z","published":"2024-02-03T06:01:21Z","title":"Nonlinear subspace clustering by functional link neural networks","summary":" Nonlinear subspace clustering based on a feed-forward neural network has been\ndemonstrated to provide better clustering accuracy than some advanced subspace\nclustering algorithms. While this approach demonstrates impressive outcomes, it\ninvolves a balance between effectiveness and computational cost. In this study,\nwe employ a functional link neural network to transform data samples into a\nnonlinear domain. Subsequently, we acquire a self-representation matrix through\na learning mechanism that builds upon the mapped samples. As the functional\nlink neural network is a single-layer neural network, our proposed method\nachieves high computational efficiency while ensuring desirable clustering\nperformance. By incorporating the local similarity regularization to enhance\nthe grouping effect, our proposed method further improves the quality of the\nclustering results. Additionally, we introduce a convex combination subspace\nclustering scheme, which combining a linear subspace clustering method with the\nfunctional link neural network subspace clustering approach. This combination\napproach allows for a dynamic balance between linear and nonlinear\nrepresentations. Extensive experiments confirm the advancement of our methods.\nThe source code will be released on https://lshi91.github.io/ soon.\n","authors":["Long Shi","Lei Cao","Zhongpu Chen","Badong Chen","Yu Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14785v1","updated":"2024-08-27T05:23:45Z","published":"2024-08-27T05:23:45Z","title":"Unsupervised-to-Online Reinforcement Learning","summary":" Offline-to-online reinforcement learning (RL), a framework that trains a\npolicy with offline RL and then further fine-tunes it with online RL, has been\nconsidered a promising recipe for data-driven decision-making. While sensible,\nthis framework has drawbacks: it requires domain-specific offline RL\npre-training for each task, and is often brittle in practice. In this work, we\npropose unsupervised-to-online RL (U2O RL), which replaces domain-specific\nsupervised offline RL with unsupervised offline RL, as a better alternative to\noffline-to-online RL. U2O RL not only enables reusing a single pre-trained\nmodel for multiple downstream tasks, but also learns better representations,\nwhich often result in even better performance and stability than supervised\noffline-to-online RL. To instantiate U2O RL in practice, we propose a general\nrecipe for U2O RL to bridge task-agnostic unsupervised offline skill-based\npolicy pre-training and supervised online fine-tuning. 
Throughout our\nexperiments in nine state-based and pixel-based environments, we empirically\ndemonstrate that U2O RL achieves strong performance that matches or even\noutperforms previous offline-to-online RL approaches, while being able to reuse\na single pre-trained model for a number of different downstream tasks.\n","authors":["Junsu Kim","Seohong Park","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2408.14785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14763v2","updated":"2024-08-27T05:09:09Z","published":"2023-12-22T15:28:55Z","title":"Enhanced Latent Multi-view Subspace Clustering","summary":" Latent multi-view subspace clustering has been demonstrated to have desirable\nclustering performance. However, the original latent representation method\nvertically concatenates the data matrices from multiple views into a single\nmatrix along the direction of dimensionality to recover the latent\nrepresentation matrix, which may result in an incomplete information recovery.\nTo fully recover the latent space representation, we in this paper propose an\nEnhanced Latent Multi-view Subspace Clustering (ELMSC) method. The ELMSC method\ninvolves constructing an augmented data matrix that enhances the representation\nof multi-view data. Specifically, we stack the data matrices from various views\ninto the block-diagonal locations of the augmented matrix to exploit the\ncomplementary information. Meanwhile, the non-block-diagonal entries are\ncomposed based on the similarity between different views to capture the\nconsistent information. In addition, we enforce a sparse regularization for the\nnon-diagonal blocks of the augmented self-representation matrix to avoid\nredundant calculations of consistency information. Finally, a novel iterative\nalgorithm based on the framework of Alternating Direction Method of Multipliers\n(ADMM) is developed to solve the optimization problem for ELMSC. Extensive\nexperiments on real-world datasets demonstrate that our proposed ELMSC is able\nto achieve higher clustering performance than some state-of-art multi-view\nclustering methods.\n","authors":["Long Shi","Lei Cao","Jun Wang","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14780v1","updated":"2024-08-27T04:57:53Z","published":"2024-08-27T04:57:53Z","title":"GINN-KAN: Interpretability pipelining with applications in Physics\n Informed Neural Networks","summary":" Neural networks are powerful function approximators, yet their ``black-box\"\nnature often renders them opaque and difficult to interpret. While many\npost-hoc explanation methods exist, they typically fail to capture the\nunderlying reasoning processes of the networks. A truly interpretable neural\nnetwork would be trained similarly to conventional models using techniques such\nas backpropagation, but additionally provide insights into the learned\ninput-output relationships. In this work, we introduce the concept of\ninterpretability pipelineing, to incorporate multiple interpretability\ntechniques to outperform each individual technique. To this end, we first\nevaluate several architectures that promise such interpretability, with a\nparticular focus on two recent models selected for their potential to\nincorporate interpretability into standard neural network architectures while\nstill leveraging backpropagation: the Growing Interpretable Neural Network\n(GINN) and Kolmogorov Arnold Networks (KAN). 
We analyze the limitations and\nstrengths of each and introduce a novel interpretable neural network GINN-KAN\nthat synthesizes the advantages of both models. When tested on the Feynman\nsymbolic regression benchmark datasets, GINN-KAN outperforms both GINN and KAN.\nTo highlight the capabilities and the generalizability of this approach, we\nposition GINN-KAN as an alternative to conventional black-box networks in\nPhysics-Informed Neural Networks (PINNs). We expect this to have far-reaching\nimplications in the application of deep learning pipelines in the natural\nsciences. Our experiments with this interpretable PINN on 15 different partial\ndifferential equations demonstrate that GINN-KAN augmented PINNs outperform\nPINNs with black-box networks in solving differential equations and surpass the\ncapabilities of both GINN and KAN.\n","authors":["Nisal Ranasinghe","Yu Xia","Sachith Seneviratne","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2408.14780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14778v1","updated":"2024-08-27T04:56:45Z","published":"2024-08-27T04:56:45Z","title":"GPU-Accelerated Counterfactual Regret Minimization","summary":" Counterfactual regret minimization (CFR) is a family of algorithms of\nno-regret learning dynamics capable of solving large-scale imperfect\ninformation games. There has been a notable lack of work on making CFR more\ncomputationally efficient. We propose implementing this algorithm as a series\nof dense and sparse matrix and vector operations, thereby making it highly\nparallelizable for a graphical processing unit. Our experiments show that our\nimplementation performs up to about 352.5 times faster than OpenSpiel's Python\nimplementation and up to about 22.2 times faster than OpenSpiel's C++\nimplementation and the speedup becomes more pronounced as the size of the game\nbeing solved grows.\n","authors":["Juho Kim"],"pdf_url":"https://arxiv.org/pdf/2408.14778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14777v1","updated":"2024-08-27T04:56:22Z","published":"2024-08-27T04:56:22Z","title":"Quartered Chirp Spectral Envelope for Whispered vs Normal Speech\n Classification","summary":" Whispered speech as an acceptable form of human-computer interaction is\ngaining traction. Systems that address multiple modes of speech require a\nrobust front-end speech classifier. Performance of whispered vs normal speech\nclassification drops in the presence of additive white Gaussian noise, since\nnormal speech takes on some of the characteristics of whispered speech. In this\nwork, we propose a new feature named the quartered chirp spectral envelope, a\ncombination of the chirp spectrum and the quartered spectral envelope, to\nclassify whispered and normal speech. The chirp spectrum can be fine-tuned to\nobtain customized features for a given task, and the quartered spectral\nenvelope has been proven to work especially well for the current task. The\nfeature is trained on a one dimensional convolutional neural network, that\ncaptures the trends in the spectral envelope. The proposed system performs\nbetter than the state of the art, in the presence of white noise.\n","authors":["S. Johanan Joysingh","P. Vijayalakshmi","T. 
Nagarajan"],"pdf_url":"https://arxiv.org/pdf/2408.14777v1.pdf","comment":"submitted to TENCON 2024"},{"id":"http://arxiv.org/abs/2408.13609v2","updated":"2024-08-27T04:49:46Z","published":"2024-08-24T15:43:02Z","title":"GNN: Graph Neural Network and Large Language Model for Data Discovery","summary":" Our algorithm GNN: Graph Neural Network and Large Language Model for Data\nDiscovery inherit the benefits of \\cite{hoang2024plod} (PLOD: Predictive\nLearning Optimal Data Discovery), \\cite{Hoang2024BODBO} (BOD: Blindly Optimal\nData Discovery) in terms of overcoming the challenges of having to predefine\nutility function and the human input for attribute ranking, which helps prevent\nthe time-consuming loop process. In addition to these previous works, our\nalgorithm GNN leverages the advantages of graph neural networks and large\nlanguage models to understand text type values that cannot be understood by\nPLOD and MOD, thus making the task of predicting outcomes more reliable. GNN\ncould be seen as an extension of PLOD in terms of understanding the text type\nvalue and the user's preferences, not only numerical values but also text\nvalues, making the promise of data science and analytics purposes.\n","authors":["Thomas Hoang"],"pdf_url":"https://arxiv.org/pdf/2408.13609v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04814v4","updated":"2024-08-27T04:43:10Z","published":"2024-04-07T05:47:41Z","title":"Inference-Time Rule Eraser: Fair Recognition via Distilling and Removing\n Biased Rules","summary":" Machine learning models often make predictions based on biased features such\nas gender, race, and other social attributes, posing significant fairness\nrisks, especially in societal applications, such as hiring, banking, and\ncriminal justice. Traditional approaches to addressing this issue involve\nretraining or fine-tuning neural networks with fairness-aware optimization\nobjectives. However, these methods can be impractical due to significant\ncomputational resources, complex industrial tests, and the associated CO2\nfootprint. Additionally, regular users often fail to fine-tune models because\nthey lack access to model parameters In this paper, we introduce the\nInference-Time Rule Eraser (Eraser), a novel method designed to address\nfairness concerns by removing biased decision-making rules from deployed models\nduring inference without altering model weights. We begin by establishing a\ntheoretical foundation for modifying model outputs to eliminate biased rules\nthrough Bayesian analysis. Next, we present a specific implementation of Eraser\nthat involves two stages: (1) distilling the biased rules from the deployed\nmodel into an additional patch model, and (2) removing these biased rules from\nthe output of the deployed model during inference. Extensive experiments\nvalidate the effectiveness of our approach, showcasing its superior performance\nin addressing fairness concerns in AI systems.\n","authors":["Yi Zhang","Dongyuan Lu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2404.04814v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15773v2","updated":"2024-08-27T04:41:40Z","published":"2024-07-22T16:25:41Z","title":"STAMP: Outlier-Aware Test-Time Adaptation with Stable Memory Replay","summary":" Test-time adaptation (TTA) aims to address the distribution shift between the\ntraining and test data with only unlabeled data at test time. 
Existing TTA\nmethods often focus on improving recognition performance specifically for test\ndata associated with classes in the training set. However, during the\nopen-world inference process, there are inevitably test data instances from\nunknown classes, commonly referred to as outliers. This paper pays attention to\nthe problem that conducts both sample recognition and outlier rejection during\ninference while outliers exist. To address this problem, we propose a new\napproach called STAble Memory rePlay (STAMP), which performs optimization over\na stable memory bank instead of the risky mini-batch. In particular, the memory\nbank is dynamically updated by selecting low-entropy and label-consistent\nsamples in a class-balanced manner. In addition, we develop a self-weighted\nentropy minimization strategy that assigns higher weight to low-entropy\nsamples. Extensive results demonstrate that STAMP outperforms existing TTA\nmethods in terms of both recognition and outlier detection performance. The\ncode is released at https://github.com/yuyongcan/STAMP.\n","authors":["Yongcan Yu","Lijun Sheng","Ran He","Jian Liang"],"pdf_url":"https://arxiv.org/pdf/2407.15773v2.pdf","comment":"Accepted by ECCV 2024; Fixed a bug in calculating OOD score of STAMP\n and updated the results"},{"id":"http://arxiv.org/abs/2408.14025v2","updated":"2024-08-27T04:36:52Z","published":"2024-08-26T05:31:46Z","title":"An Item Response Theory-based R Module for Algorithm Portfolio Analysis","summary":" Experimental evaluation is crucial in AI research, especially for assessing\nalgorithms across diverse tasks. Many studies often evaluate a limited set of\nalgorithms, failing to fully understand their strengths and weaknesses within a\ncomprehensive portfolio. This paper introduces an Item Response Theory (IRT)\nbased analysis tool for algorithm portfolio evaluation called AIRT-Module.\nTraditionally used in educational psychometrics, IRT models test question\ndifficulty and student ability using responses to test questions. Adapting IRT\nto algorithm evaluation, the AIRT-Module contains a Shiny web application and\nthe R package airt. AIRT-Module uses algorithm performance measures to compute\nanomalousness, consistency, and difficulty limits for an algorithm and the\ndifficulty of test instances. The strengths and weaknesses of algorithms are\nvisualised using the difficulty spectrum of the test instances. AIRT-Module\noffers a detailed understanding of algorithm capabilities across varied test\ninstances, thus enhancing comprehensive AI method assessment. It is available\nat https://sevvandi.shinyapps.io/AIRT/ .\n","authors":["Brodie Oldfield","Sevvandi Kandanaarachchi","Ziqi Xu","Mario Andrés Muñoz"],"pdf_url":"https://arxiv.org/pdf/2408.14025v2.pdf","comment":"10 Pages, 6 Figures. Submitted to SoftwareX"},{"id":"http://arxiv.org/abs/2408.14774v1","updated":"2024-08-27T04:31:58Z","published":"2024-08-27T04:31:58Z","title":"Instruct-SkillMix: A Powerful Pipeline for LLM Instruction Tuning","summary":" We introduce Instruct-SkillMix, an automated approach for creating diverse,\nhigh quality SFT data. The Instruct-SkillMix pipeline involves two stages, each\nleveraging an existing powerful LLM: (1) Skill extraction: uses the LLM to\nextract core \"skills\" for instruction-following, either from existing datasets,\nor by directly prompting the model; (2) Data generation: uses the powerful LLM\nto generate (instruction, response) data that exhibit a randomly chosen pair of\nthese skills. 
Here, the use of random skill combinations promotes diversity and\ndifficulty.\n Vanilla SFT (i.e., no PPO, DPO, or RL methods) on data generated from\nInstruct-SkillMix leads to strong gains on instruction following benchmarks\nsuch as AlpacaEval 2.0, MT-Bench, and WildBench. With just $4$K examples,\nLLaMA-3-8B-Base achieves 42.76% length-controlled win rate on AlpacaEval 2.0.\nTo our knowledge, this achieves state-of-the-art performance among all models\nthat have only undergone SFT (no RL methods) and competes with proprietary\nmodels such as Claude 3 Opus and LLaMA-3.1-405B-Instruct.\n Ablation studies also suggest plausible reasons for why creating open\ninstruction-tuning datasets via naive crowd-sourcing has proved difficult.\nIntroducing low quality answers (\"shirkers\") in $20\\%$ of Instruct-SkillMix\nexamples causes performance to plummet, sometimes catastrophically.\n The Instruct-SkillMix pipeline is flexible and is adaptable to other\nsettings.\n","authors":["Simran Kaur","Simon Park","Anirudh Goyal","Sanjeev Arora"],"pdf_url":"https://arxiv.org/pdf/2408.14774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14080v2","updated":"2024-08-27T04:14:14Z","published":"2024-08-26T08:02:57Z","title":"SONICS: Synthetic Or Not -- Identifying Counterfeit Songs","summary":" The recent surge in AI-generated songs presents exciting possibilities and\nchallenges. While these tools democratize music creation, they also necessitate\nthe ability to distinguish between human-composed and AI-generated songs for\nsafeguarding artistic integrity and content curation. Existing research and\ndatasets in fake song detection only focus on singing voice deepfake detection\n(SVDD), where the vocals are AI-generated but the instrumental music is sourced\nfrom real songs. However, this approach is inadequate for contemporary\nend-to-end AI-generated songs where all components (vocals, lyrics, music, and\nstyle) could be AI-generated. Additionally, existing datasets lack lyrics-music\ndiversity, long-duration songs, and open fake songs. To address these gaps, we\nintroduce SONICS, a novel dataset for end-to-end Synthetic Song Detection\n(SSD), comprising over 97k songs with over 49k synthetic songs from popular\nplatforms like Suno and Udio. Furthermore, we highlight the importance of\nmodeling long-range temporal dependencies in songs for effective authenticity\ndetection, an aspect overlooked in existing methods. To capture these patterns,\nwe propose a novel model, SpecTTTra, that is up to 3 times faster and 6 times\nmore memory efficient compared to popular CNN and Transformer-based models\nwhile maintaining competitive performance. 
Finally, we offer both AI-based and\nHuman evaluation benchmarks, addressing another deficiency in current research.\n","authors":["Md Awsafur Rahman","Zaber Ibn Abdul Hakim","Najibul Haque Sarker","Bishmoy Paul","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2408.14080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13820v2","updated":"2024-08-27T03:58:09Z","published":"2023-10-20T21:14:07Z","title":"FERI: A Multitask-based Fairness Achieving Algorithm with Applications\n to Fair Organ Transplantation","summary":" Liver transplantation often faces fairness challenges across subgroups\ndefined by sensitive attributes such as age group, gender, and race/ethnicity.\nMachine learning models for outcome prediction can introduce additional biases.\nTherefore, we introduce Fairness through the Equitable Rate of Improvement in\nMultitask Learning (FERI) algorithm for fair predictions of graft failure risk\nin liver transplant patients. FERI constrains subgroup loss by balancing\nlearning rates and preventing subgroup dominance in the training process. Our\nresults show that FERI maintained high predictive accuracy with AUROC and AUPRC\ncomparable to baseline models. More importantly, FERI demonstrated an ability\nto improve fairness without sacrificing accuracy. Specifically, for the gender,\nFERI reduced the demographic parity disparity by 71.74%, and for the age group,\nit decreased the equalized odds disparity by 40.46%. Therefore, the FERI\nalgorithm advanced fairness-aware predictive modeling in healthcare and\nprovides an invaluable tool for equitable healthcare systems.\n","authors":["Can Li","Dejian Lai","Xiaoqian Jiang","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.13820v2.pdf","comment":"First Prize Student Award Paper, American Medical Informatics\n Association 2024 Informatics Summit"},{"id":"http://arxiv.org/abs/2405.19730v4","updated":"2024-08-27T03:45:18Z","published":"2024-05-30T06:21:34Z","title":"Research on the Spatial Data Intelligent Foundation Model","summary":" This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. 
Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Jingyuan Wang","Zhengchao Chen","Hao Lu","Jiayi Li","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v4.pdf","comment":"V1 and V2 are in Chinese language, other versions are in English"},{"id":"http://arxiv.org/abs/2402.10260v2","updated":"2024-08-27T03:32:47Z","published":"2024-02-15T18:58:09Z","title":"A StrongREJECT for Empty Jailbreaks","summary":" Most jailbreak papers claim the jailbreaks they propose are highly effective,\noften boasting near-100% attack success rates. However, it is perhaps more\ncommon than not for jailbreak developers to substantially exaggerate the\neffectiveness of their jailbreaks. We suggest this problem arises because\njailbreak researchers lack a standard, high-quality benchmark for evaluating\njailbreak performance, leaving researchers to create their own. To create a\nbenchmark, researchers must choose a dataset of forbidden prompts to which a\nvictim model will respond, along with an evaluation method that scores the\nharmfulness of the victim model's responses. We show that existing benchmarks\nsuffer from significant shortcomings and introduce the StrongREJECT benchmark\nto address these issues. StrongREJECT's dataset contains prompts that victim\nmodels must answer with specific, harmful information, while its automated\nevaluator measures the extent to which a response gives useful information to\nforbidden prompts. In doing so, the StrongREJECT evaluator achieves\nstate-of-the-art agreement with human judgments of jailbreak effectiveness.\nNotably, we find that existing evaluation methods significantly overstate\njailbreak effectiveness compared to human judgments and the StrongREJECT\nevaluator. We describe a surprising and novel phenomenon that explains this\ndiscrepancy: jailbreaks bypassing a victim model's safety fine-tuning tend to\nreduce its capabilities. Together, our findings underscore the need for\nresearchers to use a high-quality benchmark, such as StrongREJECT, when\ndeveloping new jailbreak attacks. We release the StrongREJECT code and data at\nhttps://strong-reject.readthedocs.io/en/latest/.\n","authors":["Alexandra Souly","Qingyuan Lu","Dillon Bowen","Tu Trinh","Elvis Hsieh","Sana Pandey","Pieter Abbeel","Justin Svegliato","Scott Emmons","Olivia Watkins","Sam Toyer"],"pdf_url":"https://arxiv.org/pdf/2402.10260v2.pdf","comment":"Code and data at https://strong-reject.readthedocs.io/en/latest/"},{"id":"http://arxiv.org/abs/2408.14763v1","updated":"2024-08-27T03:30:18Z","published":"2024-08-27T03:30:18Z","title":"Channel-wise Influence: Estimating Data Influence for Multivariate Time\n Series","summary":" The influence function, a technique from robust statistics, measures the\nimpact on model parameters or related functions when training data is removed\nor modified. 
This effective and valuable post-hoc method allows for studying\nthe interpretability of machine learning models without requiring costly model\nretraining. It would provide extensions like increasing model performance,\nimproving model generalization, and offering interpretability. Recently,\nMultivariate Time Series (MTS) analysis has become an important yet challenging\ntask, attracting significant attention. However, there is no preceding research\non the influence functions of MTS to shed light on the effects of modifying the\nchannel of training MTS. Given that each channel in an MTS plays a crucial role\nin its analysis, it is essential to characterize the influence of different\nchannels. To fill this gap, we propose a channel-wise influence function, which\nis the first method that can estimate the influence of different channels in\nMTS, utilizing a first-order gradient approximation that leverages the more\ninformative average gradient of the data set. Additionally, we demonstrate how\nthis influence function can be used to estimate the impact of a channel in MTS.\nFinally, we validated the accuracy and effectiveness of our influence\nestimation function in critical MTS analysis tasks, such as MTS anomaly\ndetection and MTS forecasting. According to abundant experiments on real-world\ndataset, the original influence function performs worse than our method and\neven fail for the channel pruning problem, which demonstrate the superiority\nand necessity of channel-wise influence function in MTS analysis tasks.\n","authors":["Muyao Wang","Zeke Xie","Bo Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14762v1","updated":"2024-08-27T03:30:01Z","published":"2024-08-27T03:30:01Z","title":"Explainable Hierarchical Urban Representation Learning for Commuting\n Flow Prediction","summary":" Commuting flow prediction is an essential task for municipal operations in\nthe real world. Previous studies have revealed that it is feasible to estimate\nthe commuting origin-destination (OD) demand within a city using multiple\nauxiliary data. However, most existing methods are not suitable to deal with a\nsimilar task at a large scale, namely within a prefecture or the whole nation,\nowing to the increased number of geographical units that need to be maintained.\nIn addition, region representation learning is a universal approach for gaining\nurban knowledge for diverse metropolitan downstream tasks. Although many\nresearchers have developed comprehensive frameworks to describe urban units\nfrom multi-source data, they have not clarified the relationship between the\nselected geographical elements. Furthermore, metropolitan areas naturally\npreserve ranked structures, like cities and their inclusive districts, which\nmakes elucidating relations between cross-level urban units necessary.\nTherefore, we develop a heterogeneous graph-based model to generate meaningful\nregion embeddings at multiple spatial resolutions for predicting different\ntypes of inter-level OD flows. To demonstrate the effectiveness of the proposed\nmethod, extensive experiments were conducted using real-world aggregated mobile\nphone datasets collected from Shizuoka Prefecture, Japan. The results indicate\nthat our proposed model outperforms existing models in terms of a uniform urban\nstructure. 
We extend the understanding of predicted results using reasonable\nexplanations to enhance the credibility of the model.\n","authors":["Mingfei Cai","Yanbo Pang","Yoshihide Sekimoto"],"pdf_url":"https://arxiv.org/pdf/2408.14762v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.13448v2","updated":"2024-08-27T03:28:50Z","published":"2024-08-24T03:12:21Z","title":"ALIAS: DAG Learning with Efficient Unconstrained Policies","summary":" Recently, reinforcement learning (RL) has proved a promising alternative for\nconventional local heuristics in score-based approaches to learning directed\nacyclic causal graphs (DAGs) from observational data. However, the intricate\nacyclicity constraint still challenges the efficient exploration of the vast\nspace of DAGs in existing methods. In this study, we introduce ALIAS\n(reinforced dAg Learning wIthout Acyclicity conStraints), a novel approach to\ncausal discovery powered by the RL machinery. Our method features an efficient\npolicy for generating DAGs in just a single step with an optimal quadratic\ncomplexity, fueled by a novel parametrization of DAGs that directly translates\na continuous space to the space of all DAGs, bypassing the need for explicitly\nenforcing acyclicity constraints. This approach enables us to navigate the\nsearch space more effectively by utilizing policy gradient methods and\nestablished scoring functions. In addition, we provide compelling empirical\nevidence for the strong performance of ALIAS in comparison with\nstate-of-the-arts in causal discovery over increasingly difficult experiment\nconditions on both synthetic and real datasets.\n","authors":["Bao Duong","Hung Le","Thin Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.13448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03865v3","updated":"2024-08-27T03:25:58Z","published":"2023-11-07T10:28:17Z","title":"When Fairness Meets Privacy: Exploring Privacy Threats in Fair Binary\n Classifiers via Membership Inference Attacks","summary":" Previous studies have developed fairness methods for biased models that\nexhibit discriminatory behaviors towards specific subgroups. While these models\nhave shown promise in achieving fair predictions, recent research has\nidentified their potential vulnerability to score-based membership inference\nattacks (MIAs). In these attacks, adversaries can infer whether a particular\ndata sample was used during training by analyzing the model's prediction\nscores. However, our investigations reveal that these score-based MIAs are\nineffective when targeting fairness-enhanced models in binary classifications.\nThe attack models trained to launch the MIAs degrade into simplistic threshold\nmodels, resulting in lower attack performance. Meanwhile, we observe that\nfairness methods often lead to prediction performance degradation for the\nmajority subgroups of the training data. This raises the barrier to successful\nattacks and widens the prediction gaps between member and non-member data.\nBuilding upon these insights, we propose an efficient MIA method against\nfairness-enhanced models based on fairness discrepancy results (FD-MIA). It\nleverages the difference in the predictions from both the original and\nfairness-enhanced models and exploits the observed prediction gaps as attack\nclues. 
We also explore potential strategies for mitigating privacy leakages.\nExtensive experiments validate our findings and demonstrate the efficacy of the\nproposed method.\n","authors":["Huan Tian","Guangsheng Zhang","Bo Liu","Tianqing Zhu","Ming Ding","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.03865v3.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2408.14757v1","updated":"2024-08-27T03:17:52Z","published":"2024-08-27T03:17:52Z","title":"Learning effective pruning at initialization from iterative pruning","summary":" Pruning at initialization (PaI) reduces training costs by removing weights\nbefore training, which becomes increasingly crucial with the growing network\nsize. However, current PaI methods still have a large accuracy gap with\niterative pruning, especially at high sparsity levels. This raises an\nintriguing question: can we get inspiration from iterative pruning to improve\nthe PaI performance? In the lottery ticket hypothesis, the iterative rewind\npruning (IRP) finds subnetworks retroactively by rewinding the parameter to the\noriginal initialization in every pruning iteration, which means all the\nsubnetworks are based on the initial state. Here, we hypothesise the surviving\nsubnetworks are more important and bridge the initial feature and their\nsurviving score as the PaI criterion. We employ an end-to-end neural network\n(\\textbf{AutoS}parse) to learn this correlation, input the model's initial\nfeatures, output their score and then prune the lowest score parameters before\ntraining. To validate the accuracy and generalization of our method, we\nperformed PaI across various models. Results show that our approach outperforms\nexisting methods in high-sparsity settings. Notably, as the underlying logic of\nmodel pruning is consistent in different models, only one-time IRP on one model\nis needed (e.g., once IRP on ResNet-18/CIFAR-10, AutoS can be generalized to\nVGG-16/CIFAR-10, ResNet-18/TinyImageNet, et al.). As the first neural\nnetwork-based PaI method, we conduct extensive experiments to validate the\nfactors influencing this approach. These results reveal the learning tendencies\nof neural networks and provide new insights into our understanding and research\nof PaI from a practical perspective. Our code is available at:\nhttps://github.com/ChengYaofeng/AutoSparse.git.\n","authors":["Shengkai Liu","Yaofeng Cheng","Fusheng Zha","Wei Guo","Lining Sun","Zhenshan Bing","Chenguang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.14757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14756v1","updated":"2024-08-27T03:12:08Z","published":"2024-08-27T03:12:08Z","title":"Training-Free Time-Series Anomaly Detection: Leveraging Image Foundation\n Models","summary":" Recent advancements in time-series anomaly detection have relied on deep\nlearning models to handle the diverse behaviors of time-series data. However,\nthese models often suffer from unstable training and require extensive\nhyperparameter tuning, leading to practical limitations. Although foundation\nmodels present a potential solution, their use in time series is limited. To\novercome these issues, we propose an innovative image-based, training-free\ntime-series anomaly detection (ITF-TAD) approach. ITF-TAD converts time-series\ndata into images using wavelet transform and compresses them into a single\nrepresentation, leveraging image foundation models for anomaly detection. 
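In the spirit of the pruning-at-initialization entry above, the sketch below maps per-weight features at initialization (magnitude and initial gradient) through a small scoring network and masks the lowest-scoring weights before training. The two features and the tiny MLP are illustrative assumptions; in the paper the scorer is trained to imitate survival under iterative rewind pruning, whereas here it is left untrained as a placeholder.

```python
# Hedged sketch of a learned pruning-at-initialization criterion.
import torch
import torch.nn as nn

def initial_weight_features(model, loss_fn, x, y):
    """Per-weight features at initialization: |w| and |dL/dw| on one batch."""
    loss = loss_fn(model(x), y)
    grads = torch.autograd.grad(loss, list(model.parameters()))
    feats = [torch.stack([p.abs().flatten(), g.abs().flatten()], dim=1)
             for p, g in zip(model.parameters(), grads)]
    return torch.cat(feats)                    # shape: (num_weights, 2)

# Placeholder scorer; the paper learns this mapping from iterative pruning runs.
scorer = nn.Sequential(nn.Linear(2, 16), nn.ReLU(), nn.Linear(16, 1))

def prune_mask(features, sparsity=0.9):
    """Keep the (1 - sparsity) fraction of weights with the highest scores."""
    scores = scorer(features).squeeze(1)
    k = max(1, int((1.0 - sparsity) * scores.numel()))
    keep = torch.zeros_like(scores, dtype=torch.bool)
    keep[torch.topk(scores, k).indices] = True
    return keep                                # boolean mask over all weights
```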
This\napproach achieves high-performance anomaly detection without unstable neural\nnetwork training or hyperparameter tuning. Furthermore, ITF-TAD identifies\nanomalies across different frequencies, providing users with a detailed\nvisualization of anomalies and their corresponding frequencies. Comprehensive\nexperiments on five benchmark datasets, including univariate and multivariate\ntime series, demonstrate that ITF-TAD offers a practical and effective solution\nwith performance exceeding or comparable to that of deep models.\n","authors":["Nobuo Namura","Yuma Ichikawa"],"pdf_url":"https://arxiv.org/pdf/2408.14756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14747v1","updated":"2024-08-27T02:52:15Z","published":"2024-08-27T02:52:15Z","title":"Benchmarking Reinforcement Learning Methods for Dexterous Robotic\n Manipulation with a Three-Fingered Gripper","summary":" Reinforcement Learning (RL) training is predominantly conducted in\ncost-effective and controlled simulation environments. However, the transfer of\nthese trained models to real-world tasks often presents unavoidable challenges.\nThis research explores the direct training of RL algorithms in controlled yet\nrealistic real-world settings for the execution of dexterous manipulation. The\nbenchmarking results of three RL algorithms trained on intricate in-hand\nmanipulation tasks within practical real-world contexts are presented. Our\nstudy not only demonstrates the practicality of RL training in authentic\nreal-world scenarios, facilitating direct real-world applications, but also\nprovides insights into the associated challenges and considerations.\nAdditionally, our experiences with the employed experimental methods are\nshared, with the aim of empowering and engaging fellow researchers and\npractitioners in this dynamic field of robotics.\n","authors":["Elizabeth Cutler","Yuning Xing","Tony Cui","Brendan Zhou","Koen van Rijnsoever","Ben Hart","David Valencia","Lee Violet C. Ong","Trevor Gee","Minas Liarokapis","Henry Williams"],"pdf_url":"https://arxiv.org/pdf/2408.14747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13713v2","updated":"2024-08-27T02:39:56Z","published":"2024-08-25T03:26:00Z","title":"Verifiable cloud-based variational quantum algorithms","summary":" Variational quantum algorithms (VQAs) have shown potential for quantum\nadvantage with noisy intermediate-scale quantum (NISQ) devices for quantum\nmachine learning (QML). However, given the high cost and limited availability\nof quantum resources, delegating VQAs via cloud networks is a more practical\nsolution for clients with limited quantum capabilities. Recently, Shingu et\nal.[Physical Review A, 105, 022603 (2022)] proposed a variational secure cloud\nquantum computing protocol, utilizing ancilla-driven quantum computation (ADQC)\nfor cloud-based VQAs with minimal quantum resource consumption. However, their\nprotocol lacks verifiability, which exposes it to potential malicious behaviors\nby the server. Additionally, channel loss requires frequent re-delegation as\nthe size of the delegated variational circuit grows, complicating verification\ndue to increased circuit complexity. 
This paper introduces a new protocol to\naddress these challenges and enhance both verifiability and tolerance to\nchannel loss in cloud-based VQAs.\n","authors":["Junhong Yang","Banghai Wang","Junyu Quan","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2408.13713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14156v2","updated":"2024-08-27T02:31:50Z","published":"2024-06-20T09:53:56Z","title":"Tractable Equilibrium Computation in Markov Games through Risk Aversion","summary":" A significant roadblock to the development of principled multi-agent\nreinforcement learning is the fact that desired solution concepts like Nash\nequilibria may be intractable to compute. To overcome this obstacle, we take\ninspiration from behavioral economics and show that -- by imbuing agents with\nimportant features of human decision-making like risk aversion and bounded\nrationality -- a class of risk-averse quantal response equilibria (RQE) become\ntractable to compute in all $n$-player matrix and finite-horizon Markov games.\nIn particular, we show that they emerge as the endpoint of no-regret learning\nin suitably adjusted versions of the games. Crucially, the class of\ncomputationally tractable RQE is independent of the underlying game structure\nand only depends on agents' degree of risk-aversion and bounded rationality. To\nvalidate the richness of this class of solution concepts we show that it\ncaptures peoples' patterns of play in a number of 2-player matrix games\npreviously studied in experimental economics. Furthermore, we give a first\nanalysis of the sample complexity of computing these equilibria in\nfinite-horizon Markov games when one has access to a generative model and\nvalidate our findings on a simple multi-agent reinforcement learning benchmark.\n","authors":["Eric Mazumdar","Kishan Panaganti","Laixi Shi"],"pdf_url":"https://arxiv.org/pdf/2406.14156v2.pdf","comment":"preprint of multi-agent RL with risk-averse equilibria"},{"id":"http://arxiv.org/abs/2408.14738v1","updated":"2024-08-27T02:29:29Z","published":"2024-08-27T02:29:29Z","title":"Learning Differentially Private Diffusion Models via Stochastic\n Adversarial Distillation","summary":" While the success of deep learning relies on large amounts of training\ndatasets, data is often limited in privacy-sensitive domains. To address this\nchallenge, generative model learning with differential privacy has emerged as a\nsolution to train private generative models for desensitized data generation.\nHowever, the quality of the images generated by existing methods is limited due\nto the complexity of modeling data distribution. We build on the success of\ndiffusion models and introduce DP-SAD, which trains a private diffusion model\nby a stochastic adversarial distillation method. Specifically, we first train a\ndiffusion model as a teacher and then train a student by distillation, in which\nwe achieve differential privacy by adding noise to the gradients from other\nmodels to the student. For better generation quality, we introduce a\ndiscriminator to distinguish whether an image is from the teacher or the\nstudent, which forms the adversarial training. 
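The differentially private gradient step mentioned in the DP-SAD entry above can be sketched as clipping the gradients passed to the student and perturbing them with Gaussian noise. The clip norm and noise multiplier below are illustrative values; the paper's privacy accounting is not reproduced here.

```python
# Hedged sketch of clip-and-noise gradient privatization for distillation.
import torch

def clip_and_noise(grads, clip_norm=1.0, noise_multiplier=1.1):
    """Clip the overall gradient norm, then add calibrated Gaussian noise."""
    total_norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
    scale = torch.clamp(clip_norm / (total_norm + 1e-12), max=1.0)
    return [g * scale + torch.randn_like(g) * noise_multiplier * clip_norm
            for g in grads]

# Possible usage inside a training loop (assuming `student` is an nn.Module):
#   grads = torch.autograd.grad(loss, list(student.parameters()))
#   for p, g in zip(student.parameters(), clip_and_noise(grads)):
#       p.grad = g
#   optimizer.step()
```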
Extensive experiments and\nanalysis clearly demonstrate the effectiveness of our proposed method.\n","authors":["Bochao Liu","Pengju Wang","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2408.14738v1.pdf","comment":"accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2408.14736v1","updated":"2024-08-27T02:28:27Z","published":"2024-08-27T02:28:27Z","title":"Bandwidth-Aware and Overlap-Weighted Compression for\n Communication-Efficient Federated Learning","summary":" Current data compression methods, such as sparsification in Federated\nAveraging (FedAvg), effectively enhance the communication efficiency of\nFederated Learning (FL). However, these methods encounter challenges such as\nthe straggler problem and diminished model performance due to heterogeneous\nbandwidth and non-IID (Independently and Identically Distributed) data. To\naddress these issues, we introduce a bandwidth-aware compression framework for\nFL, aimed at improving communication efficiency while mitigating the problems\nassociated with non-IID data. First, our strategy dynamically adjusts\ncompression ratios according to bandwidth, enabling clients to upload their\nmodels at a close pace, thus exploiting the otherwise wasted time to transmit\nmore data. Second, we identify the non-overlapped pattern of retained\nparameters after compression, which results in diminished client update signals\ndue to uniformly averaged weights. Based on this finding, we propose a\nparameter mask to adjust the client-averaging coefficients at the parameter\nlevel, thereby more closely approximating the original updates, and improving\nthe training convergence under heterogeneous environments. Our evaluations\nreveal that our method significantly boosts model accuracy, with a maximum\nimprovement of 13% over the uncompressed FedAvg. Moreover, it achieves a\n$3.37\\times$ speedup in reaching the target accuracy compared to FedAvg with a\nTop-K compressor, demonstrating its effectiveness in accelerating convergence\nwith compression. The integration of common compression techniques into our\nframework further establishes its potential as a versatile foundation for\nfuture cross-device, communication-efficient FL research, addressing critical\nchallenges in FL and advancing the field of distributed machine learning.\n","authors":["Zichen Tang","Junlin Huang","Rudan Yan","Yuxin Wang","Zhenheng Tang","Shaohuai Shi","Amelie Chi Zhou","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2408.14736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05579v2","updated":"2024-08-27T02:23:23Z","published":"2023-12-09T13:53:35Z","title":"Conditional Stochastic Interpolation for Generative Learning","summary":" We propose a conditional stochastic interpolation (CSI) method for learning\nconditional distributions. CSI is based on estimating probability flow\nequations or stochastic differential equations that transport a reference\ndistribution to the target conditional distribution. This is achieved by first\nlearning the conditional drift and score functions based on CSI, which are then\nused to construct a deterministic process governed by an ordinary differential\nequation or a diffusion process for conditional sampling. In our proposed\napproach, we incorporate an adaptive diffusion term to address the instability\nissues arising in the diffusion process. 
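Two ideas from the bandwidth-aware federated learning entry above lend themselves to a short sketch: each client's top-k sparsification ratio scales with its bandwidth, and the server averages each parameter by how many clients actually retained it rather than by the full client count. Both functions below are simplified stand-ins for the paper's method.

```python
# Hedged sketch of bandwidth-aware sparsification and overlap-weighted averaging.
import numpy as np

def top_k_sparsify(update, ratio):
    """Keep the `ratio` fraction of entries with the largest magnitude."""
    k = max(1, int(ratio * update.size))
    mask = np.zeros_like(update, dtype=bool)
    mask[np.argpartition(np.abs(update), -k)[-k:]] = True
    return update * mask, mask

def aggregate(updates_and_masks):
    """Overlap-weighted averaging: divide by per-parameter retain counts."""
    summed = sum(u for u, _ in updates_and_masks)
    counts = sum(m.astype(float) for _, m in updates_and_masks)
    return summed / np.maximum(counts, 1.0)

bandwidths = np.array([1.0, 4.0, 8.0])                 # relative client bandwidths
ratios = 0.05 * bandwidths / bandwidths.max()          # faster links upload more
rng = np.random.default_rng(0)
client_updates = [rng.normal(size=1000) for _ in bandwidths]
packed = [top_k_sparsify(u, r) for u, r in zip(client_updates, ratios)]
global_update = aggregate(packed)
```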
We derive explicit expressions of the\nconditional drift and score functions in terms of conditional expectations,\nwhich naturally lead to an nonparametric regression approach to estimating\nthese functions. Furthermore, we establish nonasymptotic error bounds for\nlearning the target conditional distribution. We illustrate the application of\nCSI on image generation using a benchmark image dataset.\n","authors":["Ding Huang","Jian Huang","Ting Li","Guohao Shen"],"pdf_url":"https://arxiv.org/pdf/2312.05579v2.pdf","comment":"57 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.13452v2","updated":"2024-08-27T02:19:31Z","published":"2024-08-24T03:43:35Z","title":"Data Augmentation for Continual RL via Adversarial Gradient Episodic\n Memory","summary":" Data efficiency of learning, which plays a key role in the Reinforcement\nLearning (RL) training process, becomes even more important in continual RL\nwith sequential environments. In continual RL, the learner interacts with\nnon-stationary, sequential tasks and is required to learn new tasks without\nforgetting previous knowledge. However, there is little work on implementing\ndata augmentation for continual RL. In this paper, we investigate the efficacy\nof data augmentation for continual RL. Specifically, we provide benchmarking\ndata augmentations for continual RL, by (1) summarising existing data\naugmentation methods and (2) including a new augmentation method for continual\nRL: Adversarial Augmentation with Gradient Episodic Memory (Adv-GEM). Extensive\nexperiments show that data augmentations, such as random amplitude scaling,\nstate-switch, mixup, adversarial augmentation, and Adv-GEM, can improve\nexisting continual RL algorithms in terms of their average performance,\ncatastrophic forgetting, and forward transfer, on robot control tasks. All data\naugmentation methods are implemented as plug-in modules for trivial integration\ninto continual RL methods.\n","authors":["Sihao Wu","Xingyu Zhao","Xiaowei Huang"],"pdf_url":"https://arxiv.org/pdf/2408.13452v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14734v1","updated":"2024-08-27T02:03:22Z","published":"2024-08-27T02:03:22Z","title":"General-Kindred Physics-Informed Neural Network to the Solutions of\n Singularly Perturbed Differential Equations","summary":" Physics-Informed Neural Networks (PINNs) have become a promising research\ndirection in the field of solving Partial Differential Equations (PDEs).\nDealing with singular perturbation problems continues to be a difficult\nchallenge in the field of PINN. The solution of singular perturbation problems\noften exhibits sharp boundary layers and steep gradients, and traditional PINN\ncannot achieve approximation of boundary layers. In this manuscript, we propose\nthe General-Kindred Physics-Informed Neural Network (GKPINN) for solving\nSingular Perturbation Differential Equations (SPDEs). This approach utilizes\nasymptotic analysis to acquire prior knowledge of the boundary layer from the\nequation and establishes a novel network to assist PINN in approximating the\nboundary layer. It is compared with traditional PINN by solving examples of\none-dimensional, two-dimensional, and time-varying SPDE equations. The research\nfindings underscore the exceptional performance of our novel approach, GKPINN,\nwhich delivers a remarkable enhancement in reducing the $L_2$ error by two to\nfour orders of magnitude compared to the established PINN methodology. 
This\nsignificant improvement is accompanied by a substantial acceleration in\nconvergence rates, without compromising the high precision that is critical for\nour applications. Furthermore, GKPINN still performs well in extreme cases with\nperturbation parameters of ${1\\times10}^{-38}$, demonstrating its excellent\ngeneralization ability.\n","authors":["Sen Wang","Peizhi Zhao","Qinglong Ma","Tao Song"],"pdf_url":"https://arxiv.org/pdf/2408.14734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14728v1","updated":"2024-08-27T01:41:21Z","published":"2024-08-27T01:41:21Z","title":"TART: Boosting Clean Accuracy Through Tangent Direction Guided\n Adversarial Training","summary":" Adversarial training has been shown to be successful in enhancing the\nrobustness of deep neural networks against adversarial attacks. However, this\nrobustness is accompanied by a significant decline in accuracy on clean data.\nIn this paper, we propose a novel method, called Tangent Direction Guided\nAdversarial Training (TART), that leverages the tangent space of the data\nmanifold to ameliorate the existing adversarial defense algorithms. We argue\nthat training with adversarial examples having large normal components\nsignificantly alters the decision boundary and hurts accuracy. TART mitigates\nthis issue by estimating the tangent direction of adversarial examples and\nallocating an adaptive perturbation limit according to the norm of their\ntangential component. To the best of our knowledge, our paper is the first work\nto consider the concept of tangent space and direction in the context of\nadversarial defense. We validate the effectiveness of TART through extensive\nexperiments on both simulated and benchmark datasets. The results demonstrate\nthat TART consistently boosts clean accuracy while retaining a high level of\nrobustness against adversarial attacks. Our findings suggest that incorporating\nthe geometric properties of data can lead to more effective and efficient\nadversarial training methods.\n","authors":["Bongsoo Yi","Rongjie Lai","Yao Li"],"pdf_url":"https://arxiv.org/pdf/2408.14728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06635v6","updated":"2024-08-27T01:27:29Z","published":"2023-12-11T18:51:59Z","title":"Gated Linear Attention Transformers with Hardware-Efficient Training","summary":" Transformers with linear attention allow for efficient parallel training but\ncan simultaneously be formulated as an RNN with 2D (matrix-valued) hidden\nstates, thus enjoying linear-time inference complexity. However, linear\nattention generally underperforms ordinary softmax attention. Moreover, current\nimplementations of linear attention lack I/O-awareness and are thus slower than\nhighly optimized implementations of softmax attention. This work describes a\nhardware-efficient algorithm for linear attention that trades off memory\nmovement against parallelizability. The resulting implementation, dubbed\nFLASHLINEARATTENTION, is faster than FLASHATTENTION-2 (Dao, 2023) as a\nstandalone layer even on short sequence lengths (e.g., 1K). We then generalize\nthis algorithm to a more expressive variant of linear attention with\ndata-dependent gates. 
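The recurrent view of (gated) linear attention described in the entry above can be written in a few lines: the layer keeps a 2D, matrix-valued state that is updated once per token, so inference cost is linear in sequence length. The elementwise sigmoid gate below is a simplified stand-in for the data-dependent gating in the paper, and the loop is a reference implementation rather than the hardware-efficient chunked algorithm.

```python
# Hedged sketch of the recurrent form of gated linear attention.
import torch

def gated_linear_attention(q, k, v, g):
    """q, k, g: (T, d_k); v: (T, d_v). Returns outputs of shape (T, d_v)."""
    d_k, d_v = q.shape[1], v.shape[1]
    S = torch.zeros(d_k, d_v)                       # matrix-valued hidden state
    outputs = []
    for t in range(q.shape[0]):
        gate = torch.sigmoid(g[t]).unsqueeze(1)     # (d_k, 1): decays the old state
        S = gate * S + k[t].unsqueeze(1) * v[t].unsqueeze(0)   # rank-1 update
        outputs.append(q[t] @ S)                    # read out with the query
    return torch.stack(outputs)

T, d_k, d_v = 8, 4, 6
out = gated_linear_attention(torch.randn(T, d_k), torch.randn(T, d_k),
                             torch.randn(T, d_v), torch.randn(T, d_k))
```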
When used as a replacement for the standard attention\nlayer in Transformers, the resulting gated linear attention (GLA) Transformer\nis found to perform competitively against the LLaMA-architecture Transformer\n(Touvron et al., 2023) as well recent linear-time-inference baselines such as\nRetNet (Sun et al., 2023a) and Mamba (Gu & Dao, 2023) on moderate-scale\nlanguage modeling experiments. GLA Transformer is especially effective at\nlength generalization, enabling a model trained on 2K to generalize to\nsequences longer than 20K without significant perplexity degradations. For\ntraining speed, the GLA Transformer has higher throughput than a\nsimilarly-sized Mamba model.\n","authors":["Songlin Yang","Bailin Wang","Yikang Shen","Rameswar Panda","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2312.06635v6.pdf","comment":"minor update"},{"id":"http://arxiv.org/abs/2404.13621v5","updated":"2024-08-27T01:23:50Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. The robustness of\nthese techniques, however, remains a concern, particularly in the face of\nadversarial attacks that have been proven to deceive state-of-the-art deep\nneural networks in many domains. Surprisingly, the robustness of scene flow\nnetworks against such attacks has not been thoroughly investigated. To address\nthis problem, the proposed approach aims to bridge this gap by introducing\nadversarial white-box attacks specifically tailored for scene flow networks.\nExperimental results show that the generated adversarial examples obtain up to\n33.7 relative degradation in average end-point error on the KITTI and\nFlyingThings3D datasets. The study also reveals the significant impact that\nattacks targeting point clouds in only one dimension or color channel have on\naverage end-point error. Analyzing the success and failure of these attacks on\nthe scene flow networks and their 2D optical flow network variants shows a\nhigher vulnerability for the optical flow networks. Code is available at\nhttps://github.com/aheldis/Attack-on-Scene-Flow-using-Point-Clouds.git.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16338v2","updated":"2024-08-27T01:17:34Z","published":"2023-09-28T10:51:12Z","title":"Anti-Matthew FL: Bridging the Performance Gap in Federated Learning to\n Counteract the Matthew Effect","summary":" Federated learning (FL) stands as a paradigmatic approach that facilitates\nmodel training across heterogeneous and diverse datasets originating from\nvarious data providers. However, conventional FLs fall short of achieving\nconsistent performance, potentially leading to performance degradation for\nclients who are disadvantaged in data resources. Influenced by the Matthew\neffect, deploying a performance-imbalanced global model in applications further\nimpedes the generation of high-quality data from disadvantaged clients,\nexacerbating the disparities in data resources among clients. In this work, we\npropose anti-Matthew fairness for the global model at the client level,\nrequiring equal accuracy and equal decision bias across clients. 
To balance the\ntrade-off between achieving anti-Matthew fairness and performance optimality,\nwe formalize the anti-Matthew effect federated learning (anti-Matthew FL) as a\nmulti-constrained multi-objectives optimization (MCMOO) problem and propose a\nthree-stage multi-gradient descent algorithm to obtain the Pareto optimality.\nWe theoretically analyze the convergence and time complexity of our proposed\nalgorithms. Additionally, through extensive experimentation, we demonstrate\nthat our proposed anti-Matthew FL outperforms other state-of-the-art FL\nalgorithms in achieving a high-performance global model while effectively\nbridging performance gaps among clients. We hope this work provides valuable\ninsights into the manifestation of the Matthew effect in FL and other\ndecentralized learning scenarios and can contribute to designing fairer\nlearning mechanisms, ultimately fostering societal welfare.\n","authors":["Jiashi Gao","Xin Yao","Xuetao Wei"],"pdf_url":"https://arxiv.org/pdf/2309.16338v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14721v1","updated":"2024-08-27T01:04:14Z","published":"2024-08-27T01:04:14Z","title":"PAT: Pruning-Aware Tuning for Large Language Models","summary":" Large language models (LLMs) excel in language tasks, especially with\nsupervised fine-tuning after pre-training. However, their substantial memory\nand computational requirements hinder practical applications. Structural\npruning, which reduces less significant weight dimensions, is one solution.\nYet, traditional post-hoc pruning often leads to significant performance loss,\nwith limited recovery from further fine-tuning due to reduced capacity. Since\nthe model fine-tuning refines the general and chaotic knowledge in pre-trained\nmodels, we aim to incorporate structural pruning with the fine-tuning, and\npropose the Pruning-Aware Tuning (PAT) paradigm to eliminate model redundancy\nwhile preserving the model performance to the maximum extend. Specifically, we\ninsert the innovative Hybrid Sparsification Modules (HSMs) between the\nAttention and FFN components to accordingly sparsify the upstream and\ndownstream linear modules. The HSM comprises a lightweight operator and a\nglobally shared trainable mask. The lightweight operator maintains a training\noverhead comparable to that of LoRA, while the trainable mask unifies the\nchannels to be sparsified, ensuring structural pruning. Additionally, we\npropose the Identity Loss which decouples the transformation and scaling\nproperties of the HSMs to enhance training robustness. Extensive experiments\ndemonstrate that PAT excels in both performance and efficiency. For example,\nour Llama2-7b model with a 25\\% pruning ratio achieves 1.33$\\times$ speedup\nwhile outperforming the LoRA-finetuned model by up to 1.26\\% in accuracy with a\nsimilar training cost. Code:\nhttps://github.com/kriskrisliu/PAT_Pruning-Aware-Tuning\n","authors":["Yijiang Liu","Huanrui Yang","Youxin Chen","Rongyu Zhang","Miao Wang","Yuan Du","Li Du"],"pdf_url":"https://arxiv.org/pdf/2408.14721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12023v4","updated":"2024-08-27T00:48:35Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. 
Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and\n70B) demonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables\naggressive quantization to sub-3 bits with only minor performance degradations.\nWhen finetuned on a language modeling calibration dataset, LQ-LoRA can also be\nused for model compression; in this setting our 2.75-bit LLaMA-2-70B model\n(which has 2.85 bits on average when including the low-rank components and\nrequires 27GB of GPU memory) performs respectably compared to the 16-bit\nbaseline.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10960v2","updated":"2024-08-27T00:27:12Z","published":"2024-07-15T17:55:42Z","title":"Fast Matrix Multiplications for Lookup Table-Quantized LLMs","summary":" The deployment of large language models (LLMs) is often constrained by memory\nbandwidth, where the primary bottleneck is the cost of transferring model\nparameters from the GPU's global memory to its registers. When coupled with\ncustom kernels that fuse the dequantization and matmul operations, weight-only\nquantization can thus enable faster inference by reducing the amount of memory\nmovement. However, developing high-performance kernels for weight-quantized\nLLMs presents substantial challenges, especially when the weights are\ncompressed to non-evenly-divisible bit widths (e.g., 3 bits) with non-uniform,\nlookup table (LUT) quantization. This paper describes FLUTE, a flexible lookup\ntable engine for LUT-quantized LLMs, which uses offline restructuring of the\nquantized weight matrix to minimize bit manipulations associated with\nunpacking, and vectorization and duplication of the lookup table to mitigate\nshared memory bandwidth constraints. At batch sizes < 32 and quantization group\nsize of 128 (typical in LLM inference), the FLUTE kernel can be 2-4x faster\nthan existing GEMM kernels. As an application of FLUTE, we explore a simple\nextension to lookup table-based NormalFloat quantization and apply it to\nquantize LLaMA3 to various configurations, obtaining competitive quantization\nperformance against strong baselines while obtaining an end-to-end throughput\nincrease of 1.5 to 2 times.\n","authors":["Han Guo","William Brandon","Radostin Cholakov","Jonathan Ragan-Kelley","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2407.10960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15449v1","updated":"2024-08-27T23:58:51Z","published":"2024-08-27T23:58:51Z","title":"Graph Attention Inference of Network Topology in Multi-Agent Systems","summary":" Accurately identifying the underlying graph structures of multi-agent systems\nremains a difficult challenge. 
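The arithmetic behind lookup-table weight quantization, as described in the FLUTE entry above, reduces to a table lookup per integer code, a per-group rescale, and an ordinary matmul. The reference version below only shows that math with assumed shapes and a NormalFloat-like table; the actual kernel fuses these steps on the GPU.

```python
# Hedged sketch of lookup-table (LUT) dequantization followed by a matmul.
import numpy as np

def dequantize(codes, lut, scales, group_size=128):
    """codes: int array (out, in); lut: (2**bits,); scales: (out, in // group_size)."""
    w = lut[codes]                                     # table lookup
    out_dim, in_dim = w.shape
    w = w.reshape(out_dim, in_dim // group_size, group_size)
    w = w * scales[..., None]                          # per-group rescale
    return w.reshape(out_dim, in_dim)

rng = np.random.default_rng(0)
bits, out_dim, in_dim, group = 3, 16, 256, 128
lut = np.sort(rng.normal(size=2**bits))               # e.g. NormalFloat-like levels
codes = rng.integers(0, 2**bits, size=(out_dim, in_dim))
scales = rng.uniform(0.5, 1.5, size=(out_dim, in_dim // group))
x = rng.normal(size=in_dim)
y = dequantize(codes, lut, scales, group) @ x          # weight-only quantized matmul
```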
Our work introduces a novel machine\nlearning-based solution that leverages the attention mechanism to predict\nfuture states of multi-agent systems by learning node representations. The\ngraph structure is then inferred from the strength of the attention values.\nThis approach is applied to both linear consensus dynamics and the non-linear\ndynamics of Kuramoto oscillators, resulting in implicit learning the graph by\nlearning good agent representations. Our results demonstrate that the presented\ndata-driven graph attention machine learning model can identify the network\ntopology in multi-agent systems, even when the underlying dynamic model is not\nknown, as evidenced by the F1 scores achieved in the link prediction.\n","authors":["Akshay Kolli","Reza Azadeh","Kshitj Jerath"],"pdf_url":"https://arxiv.org/pdf/2408.15449v1.pdf","comment":"Accepted for publication at Modeling and Estimation Control\n Conference 2024; 6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.07877v2","updated":"2024-08-27T22:55:03Z","published":"2024-08-15T01:33:06Z","title":"IReCa: Intrinsic Reward-enhanced Context-aware Reinforcement Learning\n for Human-AI Coordination","summary":" In human-AI coordination scenarios, human agents usually exhibit asymmetric\nbehaviors that are significantly sparse and unpredictable compared to those of\nAI agents. These characteristics introduce two primary challenges to human-AI\ncoordination: the effectiveness of obtaining sparse rewards and the efficiency\nof training the AI agents. To tackle these challenges, we propose an Intrinsic\nReward-enhanced Context-aware (IReCa) reinforcement learning (RL) algorithm,\nwhich leverages intrinsic rewards to facilitate the acquisition of sparse\nrewards and utilizes environmental context to enhance training efficiency. Our\nIReCa RL algorithm introduces three unique features: (i) it encourages the\nexploration of sparse rewards by incorporating intrinsic rewards that\nsupplement traditional extrinsic rewards from the environment; (ii) it improves\nthe acquisition of sparse rewards by prioritizing the corresponding sparse\nstate-action pairs; and (iii) it enhances the training efficiency by optimizing\nthe exploration and exploitation through innovative context-aware weights of\nextrinsic and intrinsic rewards. Extensive simulations executed in the\nOvercooked layouts demonstrate that our IReCa RL algorithm can increase the\naccumulated rewards by approximately 20% and reduce the epochs required for\nconvergence by approximately 67% compared to state-of-the-art baselines.\n","authors":["Xin Hao","Bahareh Nakisa","Mohmmad Naim Rastgoo","Richard Dazeley"],"pdf_url":"https://arxiv.org/pdf/2408.07877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15826v2","updated":"2024-08-27T22:18:07Z","published":"2024-03-23T12:53:51Z","title":"Scaling Learning based Policy Optimization for Temporal Logic Tasks by\n Controller Network Dropout","summary":" This paper introduces a model-based approach for training feedback\ncontrollers for an autonomous agent operating in a highly nonlinear (albeit\ndeterministic) environment. We desire the trained policy to ensure that the\nagent satisfies specific task objectives and safety constraints, both expressed\nin Discrete-Time Signal Temporal Logic (DT-STL). One advantage for\nreformulation of a task via formal frameworks, like DT-STL, is that it permits\nquantitative satisfaction semantics. 
In other words, given a trajectory and a\nDT-STL formula, we can compute the {\\em robustness}, which can be interpreted\nas an approximate signed distance between the trajectory and the set of\ntrajectories satisfying the formula. We utilize feedback control, and we assume\na feed forward neural network for learning the feedback controller. We show how\nthis learning problem is similar to training recurrent neural networks (RNNs),\nwhere the number of recurrent units is proportional to the temporal horizon of\nthe agent's task objectives. This poses a challenge: RNNs are susceptible to\nvanishing and exploding gradients, and na\\\"{i}ve gradient descent-based\nstrategies to solve long-horizon task objectives thus suffer from the same\nproblems. To tackle this challenge, we introduce a novel gradient approximation\nalgorithm based on the idea of dropout or gradient sampling. One of the main\ncontributions is the notion of {\\em controller network dropout}, where we\napproximate the NN controller in several time-steps in the task horizon by the\ncontrol input obtained using the controller in a previous training step. We\nshow that our control synthesis methodology, can be quite helpful for\nstochastic gradient descent to converge with less numerical issues, enabling\nscalable backpropagation over long time horizons and trajectories over high\ndimensional state spaces.\n","authors":["Navid Hashemi","Bardh Hoxha","Danil Prokhorov","Georgios Fainekos","Jyotirmoy Deshmukh"],"pdf_url":"https://arxiv.org/pdf/2403.15826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15421v1","updated":"2024-08-27T21:54:26Z","published":"2024-08-27T21:54:26Z","title":"Simultaneous Training of First- and Second-Order Optimizers in\n Population-Based Reinforcement Learning","summary":" The tuning of hyperparameters in reinforcement learning (RL) is critical, as\nthese parameters significantly impact an agent's performance and learning\nefficiency. Dynamic adjustment of hyperparameters during the training process\ncan significantly enhance both the performance and stability of learning.\nPopulation-based training (PBT) provides a method to achieve this by\ncontinuously tuning hyperparameters throughout the training. This ongoing\nadjustment enables models to adapt to different learning stages, resulting in\nfaster convergence and overall improved performance. In this paper, we propose\nan enhancement to PBT by simultaneously utilizing both first- and second-order\noptimizers within a single population. We conducted a series of experiments\nusing the TD3 algorithm across various MuJoCo environments. Our results, for\nthe first time, empirically demonstrate the potential of incorporating\nsecond-order optimizers within PBT-based RL. Specifically, the combination of\nthe K-FAC optimizer with Adam led to up to a 10% improvement in overall\nperformance compared to PBT using only Adam. 
Additionally, in environments\nwhere Adam occasionally fails, such as the Swimmer environment, the mixed\npopulation with K-FAC exhibited more reliable learning outcomes, offering a\nsignificant advantage in training stability without a substantial increase in\ncomputational time.\n","authors":["Felix Pfeiffer","Shahram Eivazi"],"pdf_url":"https://arxiv.org/pdf/2408.15421v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.15418v1","updated":"2024-08-27T21:47:09Z","published":"2024-08-27T21:47:09Z","title":"Understanding GNNs for Boolean Satisfiability through Approximation\n Algorithms","summary":" The paper deals with the interpretability of Graph Neural Networks in the\ncontext of Boolean Satisfiability. The goal is to demystify the internal\nworkings of these models and provide insightful perspectives into their\ndecision-making processes. This is done by uncovering connections to two\napproximation algorithms studied in the domain of Boolean Satisfiability:\nBelief Propagation and Semidefinite Programming Relaxations. Revealing these\nconnections has empowered us to introduce a suite of impactful enhancements.\nThe first significant enhancement is a curriculum training procedure, which\nincrementally increases the problem complexity in the training set, together\nwith increasing the number of message passing iterations of the Graph Neural\nNetwork. We show that the curriculum, together with several other\noptimizations, reduces the training time by more than an order of magnitude\ncompared to the baseline without the curriculum. Furthermore, we apply\ndecimation and sampling of initial embeddings, which significantly increase the\npercentage of solved problems.\n","authors":["Jan Hůla","David Mojžíšek","Mikoláš Janota"],"pdf_url":"https://arxiv.org/pdf/2408.15418v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2408.15417v1","updated":"2024-08-27T21:46:47Z","published":"2024-08-27T21:46:47Z","title":"Implicit Geometry of Next-token Prediction: From Language Sparsity\n Patterns to Model Representations","summary":" Next-token prediction (NTP) over large text corpora has become the go-to\nparadigm to train large language models. Yet, it remains unclear how NTP\ninfluences the mapping of linguistic patterns to geometric properties of the\nresulting model representations. We frame training of large language models as\nsoft-label classification over sparse probabilistic label vectors, coupled with\nan analytical approximation that allows unrestricted generation of context\nembeddings. This approach links NTP training to rank-constrained, nuclear-norm\nregularized optimization in the logit domain, offering a framework for\nanalyzing the geometry of word and context embeddings. In large embedding\nspaces, we find that NTP implicitly favors learning logits with a sparse plus\nlow-rank structure. While the sparse component captures the co-occurrence\nfrequency of context-word pairs, the orthogonal low-rank component, which\nbecomes dominant as training progresses, depends solely on the sparsity pattern\nof the co-occurrence matrix. Consequently, when projected onto an appropriate\nsubspace, representations of contexts that are followed by the same set of\nnext-tokens collapse, a phenomenon we term subspace-collapse. We validate our\nfindings on synthetic and small-scale real language datasets. 
Finally, we\noutline potential research directions aimed at deepening the understanding of\nNTP's influence on the learning of linguistic patterns and regularities.\n","authors":["Yize Zhao","Tina Behnia","Vala Vakilian","Christos Thrampoulidis"],"pdf_url":"https://arxiv.org/pdf/2408.15417v1.pdf","comment":"Accepted at COLM 2024"},{"id":"http://arxiv.org/abs/2310.07819v3","updated":"2024-08-27T21:37:57Z","published":"2023-10-11T19:00:40Z","title":"Faithfulness Measurable Masked Language Models","summary":" A common approach to explaining NLP models is to use importance measures that\nexpress which tokens are important for a prediction. Unfortunately, such\nexplanations are often wrong despite being persuasive. Therefore, it is\nessential to measure their faithfulness. One such metric is if tokens are truly\nimportant, then masking them should result in worse model performance. However,\ntoken masking introduces out-of-distribution issues, and existing solutions\nthat address this are computationally expensive and employ proxy models.\nFurthermore, other metrics are very limited in scope. This work proposes an\ninherently faithfulness measurable model that addresses these challenges. This\nis achieved using a novel fine-tuning method that incorporates masking, such\nthat masking tokens become in-distribution by design. This differs from\nexisting approaches, which are completely model-agnostic but are inapplicable\nin practice. We demonstrate the generality of our approach by applying it to 16\ndifferent datasets and validate it using statistical in-distribution tests. The\nfaithfulness is then measured with 9 different importance measures. Because\nmasking is in-distribution, importance measures that themselves use masking\nbecome consistently more faithful. Additionally, because the model makes\nfaithfulness cheap to measure, we can optimize explanations towards maximal\nfaithfulness; thus, our model becomes indirectly inherently explainable.\n","authors":["Andreas Madsen","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2310.07819v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11986v2","updated":"2024-08-27T21:25:39Z","published":"2023-07-22T05:34:18Z","title":"Expert Knowledge-Aware Image Difference Graph Representation Learning\n for Difference-Aware Medical Visual Question Answering","summary":" To contribute to automating the medical vision-language model, we propose a\nnovel Chest-Xray Difference Visual Question Answering (VQA) task. Given a pair\nof main and reference images, this task attempts to answer several questions on\nboth diseases and, more importantly, the differences between them. This is\nconsistent with the radiologist's diagnosis practice that compares the current\nimage with the reference before concluding the report. We collect a new\ndataset, namely MIMIC-Diff-VQA, including 700,703 QA pairs from 164,324 pairs\nof main and reference images. Compared to existing medical VQA datasets, our\nquestions are tailored to the Assessment-Diagnosis-Intervention-Evaluation\ntreatment procedure used by clinical professionals. Meanwhile, we also propose\na novel expert knowledge-aware graph representation learning model to address\nthis task. The proposed baseline model leverages expert knowledge such as\nanatomical structure prior, semantic, and spatial knowledge to construct a\nmulti-relationship graph, representing the image differences between two images\nfor the image difference VQA task. 
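The masking-based faithfulness check discussed in the faithfulness-measurable masked LM entry above can be sketched with a toy scorer: if an importance measure is faithful, masking the tokens it ranks highest should degrade the model's score the most. The bag-of-words "model" and leave-one-out importance below are illustrative stand-ins for a fine-tuned masked language model and the paper's importance measures.

```python
# Hedged sketch of a masking-based faithfulness check with a toy scorer.
import numpy as np

WEIGHTS = {"excellent": 2.0, "good": 1.0, "bad": -1.5, "terrible": -2.5}
MASK = "[MASK]"

def score(tokens):
    """Toy sentiment score; a real setup would query the fine-tuned model."""
    return sum(WEIGHTS.get(t, 0.0) for t in tokens)

def leave_one_out_importance(tokens):
    base = score(tokens)
    return [base - score(tokens[:i] + [MASK] + tokens[i + 1:])
            for i in range(len(tokens))]

def faithfulness_curve(tokens, importance):
    """Score after masking the top-k most important tokens, for growing k."""
    order = np.argsort(importance)[::-1]
    curve, masked = [], list(tokens)
    for idx in order:
        masked[idx] = MASK
        curve.append(score(masked))
    return curve

sent = "the food was excellent but the service was terrible".split()
imp = leave_one_out_importance(sent)
print(faithfulness_curve(sent, imp))   # a faithful measure degrades the score early
```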
The dataset and code can be found at\nhttps://github.com/Holipori/MIMIC-Diff-VQA. We believe this work would further\npush forward the medical vision language model.\n","authors":["Xinyue Hu","Lin Gu","Qiyuan An","Mengliang Zhang","Liangchen Liu","Kazuma Kobayashi","Tatsuya Harada","Ronald M. Summers","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.11986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15408v1","updated":"2024-08-27T21:18:41Z","published":"2024-08-27T21:18:41Z","title":"Divergence-free neural operators for stress field modeling in\n polycrystalline materials","summary":" The purpose of the current work is the development and comparison of Fourier\nneural operators (FNOs) for surrogate modeling of the quasi-static mechanical\nresponse of polycrystalline materials. Three types of such FNOs are considered\nhere: a physics-guided FNO (PgFNO), a physics-informed FNO (PiFNO), and a\nphysics-encoded FNO (PeFNO). These are trained and compared with the help of\nstress field data from a reference model for heterogeneous elastic materials\nwith a periodic grain microstructure. Whereas PgFNO training is based solely on\nthese data, that of the PiFNO and PeFNO is in addition constrained by the\nrequirement that stress fields satisfy mechanical equilibrium, i.e., be\ndivergence-free. The difference between the PiFNO and PeFNO lies in how this\nconstraint is taken into account; in the PiFNO, it is included in the loss\nfunction, whereas in the PeFNO, it is \"encoded\" in the operator architecture.\nIn the current work, this encoding is based on a stress potential and Fourier\ntransforms. As a result, only the training of the PiFNO is constrained by\nmechanical equilibrium; in contrast, mechanical equilibrium constrains both the\ntraining and output of the PeFNO. Due in particular to this, stress fields\ncalculated by the trained PeFNO are significantly more accurate than those\ncalculated by the trained PiFNO in the example cases considered.\n","authors":["Mohammad S. Khorrami","Pawan Goyal","Jaber R. Mianroodi","Bob Svendsen","Peter Benner","Dierk Raabe"],"pdf_url":"https://arxiv.org/pdf/2408.15408v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.15404v1","updated":"2024-08-27T20:57:26Z","published":"2024-08-27T20:57:26Z","title":"Evaluating Credit VIX (CDS IV) Prediction Methods with Incremental Batch\n Learning","summary":" This paper presents the experimental process and results of SVM, Gradient\nBoosting, and an Attention-GRU Hybrid model in predicting the Implied\nVolatility of rolled-over five-year spread contracts of credit default swaps\n(CDS) on European corporate debt during the quarter following mid-May '24, as\nrepresented by the iTraxx/Cboe Europe Main 1-Month Volatility Index (BP\nVolatility). The analysis employs a feature matrix inspired by Merton's\ndeterminants of default probability. Our comparative assessment aims to\nidentify strengths in SOTA and classical machine learning methods for financial\nrisk prediction\n","authors":["Robert Taylor"],"pdf_url":"https://arxiv.org/pdf/2408.15404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11552v5","updated":"2024-08-27T20:56:53Z","published":"2023-02-22T18:48:46Z","title":"Reduce, Reuse, Recycle: Compositional Generation with Energy-Based\n Diffusion Models and MCMC","summary":" Since their introduction, diffusion models have quickly become the prevailing\napproach to generative modeling in many domains. 
They can be interpreted as\nlearning the gradients of a time-varying sequence of log-probability density\nfunctions. This interpretation has motivated classifier-based and\nclassifier-free guidance as methods for post-hoc control of diffusion models.\nIn this work, we build upon these ideas using the score-based interpretation of\ndiffusion models, and explore alternative ways to condition, modify, and reuse\ndiffusion models for tasks involving compositional generation and guidance. In\nparticular, we investigate why certain types of composition fail using current\ntechniques and present a number of solutions. We conclude that the sampler (not\nthe model) is responsible for this failure and propose new samplers, inspired\nby MCMC, which enable successful compositional generation. Further, we propose\nan energy-based parameterization of diffusion models which enables the use of\nnew compositional operators and more sophisticated, Metropolis-corrected\nsamplers. Intriguingly we find these samplers lead to notable improvements in\ncompositional generation across a wide set of problems such as\nclassifier-guided ImageNet modeling and compositional text-to-image generation.\n","authors":["Yilun Du","Conor Durkan","Robin Strudel","Joshua B. Tenenbaum","Sander Dieleman","Rob Fergus","Jascha Sohl-Dickstein","Arnaud Doucet","Will Grathwohl"],"pdf_url":"https://arxiv.org/pdf/2302.11552v5.pdf","comment":"ICML 2023, Project Webpage:\n https://energy-based-model.github.io/reduce-reuse-recycle/"},{"id":"http://arxiv.org/abs/2408.15400v1","updated":"2024-08-27T20:51:48Z","published":"2024-08-27T20:51:48Z","title":"Exploring the origins of switching dynamics in a multifunctional\n reservoir computer","summary":" The concept of multifunctionality has enabled reservoir computers (RCs), a\ntype of dynamical system that is typically realised as an artificial neural\nnetwork, to reconstruct multiple attractors simultaneously using the same set\nof trained weights. However there are many additional phenomena that arise when\ntraining a RC to reconstruct more than one attractor. Previous studies have\nfound that, in certain cases, if the RC fails to reconstruct a coexistence of\nattractors then it exhibits a form of metastability whereby, without any\nexternal input, the state of the RC switches between different modes of\nbehaviour that resemble properties of the attractors it failed to reconstruct.\nIn this paper we explore the origins of these switching dynamics in a\nparadigmatic setting via the `seeing double' problem.\n","authors":["Andrew Flynn","Andreas Amann"],"pdf_url":"https://arxiv.org/pdf/2408.15400v1.pdf","comment":"Preprint submitted to Frontiers in Network Physiology"},{"id":"http://arxiv.org/abs/2408.15399v1","updated":"2024-08-27T20:51:06Z","published":"2024-08-27T20:51:06Z","title":"A Statistical Framework for Data-dependent Retrieval-Augmented Models","summary":" Modern ML systems increasingly augment input instances with additional\nrelevant information to enhance final prediction. Despite growing interest in\nsuch retrieval-augmented models, their fundamental properties and training are\nnot well understood. We propose a statistical framework to study such models\nwith two components: 1) a {\\em retriever} to identify the relevant information\nout of a large corpus via a data-dependent metric; and 2) a {\\em predictor}\nthat consumes the input instances along with the retrieved information to make\nthe final predictions. 
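The two-component structure just described for retrieval-augmented models can be illustrated directly: a retriever scores corpus items against the input with a data-dependent metric (cosine similarity here), and a predictor consumes the input concatenated with the retrieved information. The linear predictor and random features below are illustrative stand-ins, not the framework's trained components.

```python
# Hedged sketch of a retriever + predictor retrieval-augmented model.
import numpy as np

rng = np.random.default_rng(0)
corpus = rng.normal(size=(500, 16))                    # retrievable side information

def retrieve(x, corpus, k=1):
    """Return the top-k corpus rows by cosine similarity to the query x."""
    sims = corpus @ x / (np.linalg.norm(corpus, axis=1) * np.linalg.norm(x) + 1e-12)
    return corpus[np.argsort(sims)[-k:]]

def predict(x, w):
    """Predictor consumes [input ; retrieved info] to make the final prediction."""
    augmented = np.concatenate([x, retrieve(x, corpus, k=1).ravel()])
    return augmented @ w

w = rng.normal(size=32)                                # 16 input dims + 16 retrieved dims
print(predict(rng.normal(size=16), w))
```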
We present a principled method for end-to-end training\nof both components and draw connections with various training approaches in the\nliterature. Furthermore, we establish excess risk bounds for\nretrieval-augmented models while delineating the contributions of both\nretriever and predictor towards the model performance. We validate the utility\nof our proposed training methods along with the key takeaways from our\nstatistical analysis on open domain question answering task where retrieval\naugmentation is important.\n","authors":["Soumya Basu","Ankit Singh Rawat","Manzil Zaheer"],"pdf_url":"https://arxiv.org/pdf/2408.15399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15398v1","updated":"2024-08-27T20:49:11Z","published":"2024-08-27T20:49:11Z","title":"Evaluating Pre-Training Bias on Severe Acute Respiratory Syndrome\n Dataset","summary":" Machine learning (ML) is a growing field of computer science that has found\nmany practical applications in several domains, including Health. However, as\ndata grows in size and availability, and the number of models that aim to aid\nor replace human decisions, it raises the concern that these models can be\nsusceptible to bias, which can lead to harm to specific individuals by basing\nits decisions on protected attributes such as gender, religion, sexual\norientation, ethnicity, and others. Visualization techniques might generate\ninsights and help summarize large datasets, enabling data scientists to\nunderstand the data better before training a model by evaluating pre-training\nmetrics applied to the datasets before training, which might contribute to\nidentifying potential harm before any effort is put into training and deploying\nthe models. This work uses the severe acute respiratory syndrome dataset from\nOpenDataSUS to visualize three pre-training bias metrics and their distribution\nacross different regions in Brazil. A random forest model is trained in each\nregion and applied to the others. The aim is to compare the bias for the\ndifferent regions, focusing on their protected attributes and comparing the\nmodel's performance with the metric values.\n","authors":["Diego Dimer Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2408.15398v1.pdf","comment":"short paper for eurovis, 5 pages"},{"id":"http://arxiv.org/abs/2408.15395v1","updated":"2024-08-27T20:39:09Z","published":"2024-08-27T20:39:09Z","title":"SCAN-Edge: Finding MobileNet-speed Hybrid Networks for Diverse Edge\n Devices via Hardware-Aware Evolutionary Search","summary":" Designing low-latency and high-efficiency hybrid networks for a variety of\nlow-cost commodity edge devices is both costly and tedious, leading to the\nadoption of hardware-aware neural architecture search (NAS) for finding optimal\narchitectures. However, unifying NAS for a wide range of edge devices presents\nchallenges due to the variety of hardware designs, supported operations, and\ncompilation optimizations. Existing methods often fix the search space of\narchitecture choices (e.g., activation, convolution, or self-attention) and\nestimate latency using hardware-agnostic proxies (e.g., FLOPs), which fail to\nachieve proclaimed latency across various edge devices. To address this issue,\nwe propose SCAN-Edge, a unified NAS framework that jointly searches for\nself-attention, convolution, and activation to accommodate the wide variety of\nedge devices, including CPU-, GPU-, and hardware accelerator-based systems. 
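Pre-training bias metrics of the kind discussed in the entry above are computed on the dataset itself, before any model is trained. The sketch below shows two common examples, class imbalance of a protected attribute and the difference in positive-label proportions between its groups; these stand in for, and are not necessarily, the three metrics used in the paper.

```python
# Hedged sketch of two dataset-level (pre-training) bias metrics.
import numpy as np

def class_imbalance(protected):
    """(n_a - n_d) / (n_a + n_d) for a binary protected attribute."""
    n_a = np.sum(protected == 1)
    n_d = np.sum(protected == 0)
    return (n_a - n_d) / (n_a + n_d)

def label_proportion_difference(protected, labels):
    """Difference in positive-outcome rates between the two groups."""
    return labels[protected == 1].mean() - labels[protected == 0].mean()

rng = np.random.default_rng(0)
protected = rng.integers(0, 2, size=5000)              # e.g. a binarized attribute
labels = rng.binomial(1, np.where(protected == 1, 0.55, 0.45))
print(class_imbalance(protected), label_proportion_difference(protected, labels))
```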
To\nhandle the large search space, SCAN-Edge relies on with a hardware-aware\nevolutionary algorithm that improves the quality of the search space to\naccelerate the sampling process. Experiments on large-scale datasets\ndemonstrate that our hybrid networks match the actual MobileNetV2 latency for\n224x224 input resolution on various commodity edge devices.\n","authors":["Hung-Yueh Chiang","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2408.15395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15393v1","updated":"2024-08-27T20:33:16Z","published":"2024-08-27T20:33:16Z","title":"Stability Analysis of Physics-Informed Neural Networks for Stiff Linear\n Differential Equations","summary":" We present a stability analysis of Physics-Informed Neural Networks (PINNs)\ncoupled with random projections, for the numerical solution of (stiff) linear\ndifferential equations. For our analysis, we consider systems of linear ODEs,\nand linear parabolic PDEs. We prove that properly designed PINNs offer\nconsistent and asymptotically stable numerical schemes, thus convergent\nschemes. In particular, we prove that multi-collocation random projection PINNs\nguarantee asymptotic stability for very high stiffness and that\nsingle-collocation PINNs are $A$-stable. To assess the performance of the PINNs\nin terms of both numerical approximation accuracy and computational cost, we\ncompare it with other implicit schemes and in particular backward Euler, the\nmidpoint, trapezoidal (Crank-Nikolson), the 2-stage Gauss scheme and the 2 and\n3 stages Radau schemes. We show that the proposed PINNs outperform the above\ntraditional schemes, in both numerical approximation accuracy and importantly\ncomputational cost, for a wide range of step sizes.\n","authors":["Gianluca Fabiani","Erik Bollt","Constantinos Siettos","Athanasios N. Yannacopoulos"],"pdf_url":"https://arxiv.org/pdf/2408.15393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15388v1","updated":"2024-08-27T20:14:42Z","published":"2024-08-27T20:14:42Z","title":"Panoptic Perception for Autonomous Driving: A Survey","summary":" Panoptic perception represents a forefront advancement in autonomous driving\ntechnology, unifying multiple perception tasks into a singular, cohesive\nframework to facilitate a thorough understanding of the vehicle's surroundings.\nThis survey reviews typical panoptic perception models for their unique inputs\nand architectures and compares them to performance, responsiveness, and\nresource utilization. It also delves into the prevailing challenges faced in\npanoptic perception and explores potential trajectories for future research.\nOur goal is to furnish researchers in autonomous driving with a detailed\nsynopsis of panoptic perception, positioning this survey as a pivotal reference\nin the ever-evolving landscape of autonomous driving technologies.\n","authors":["Yunge Li","Lanyu Xu"],"pdf_url":"https://arxiv.org/pdf/2408.15388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01959v2","updated":"2024-08-27T19:57:45Z","published":"2024-08-04T08:26:58Z","title":"Dataset Scale and Societal Consistency Mediate Facial Impression Bias in\n Vision-Language AI","summary":" Multimodal AI models capable of associating images and text hold promise for\nnumerous domains, ranging from automated image captioning to accessibility\napplications for blind and low-vision users. However, uncertainty about bias\nhas in some cases limited their adoption and availability. 
In the present work,\nwe study 43 CLIP vision-language models to determine whether they learn\nhuman-like facial impression biases, and we find evidence that such biases are\nreflected across three distinct CLIP model families. We show for the first time\nthat the degree to which a bias is shared across a society predicts the\ndegree to which it is reflected in a CLIP model. Human-like impressions of\nvisually unobservable attributes, like trustworthiness and sexuality, emerge\nonly in models trained on the largest dataset, indicating that a better fit to\nuncurated cultural data results in the reproduction of increasingly subtle\nsocial biases. Moreover, we use a hierarchical clustering approach to show that\ndataset size predicts the extent to which the underlying structure of facial\nimpression bias resembles that of facial impression bias in humans. Finally, we\nshow that Stable Diffusion models employing CLIP as a text encoder learn facial\nimpression biases, and that these biases intersect with racial biases in Stable\nDiffusion XL-Turbo. While pretrained CLIP models may prove useful for\nscientific studies of bias, they will also require significant dataset curation\nwhen intended for use as general-purpose models in a zero-shot setting.\n","authors":["Robert Wolfe","Aayushi Dangol","Alexis Hiniker","Bill Howe"],"pdf_url":"https://arxiv.org/pdf/2408.01959v2.pdf","comment":"Accepted at Artificial Intelligence, Ethics, and Society 2024"},{"id":"http://arxiv.org/abs/2401.03717v3","updated":"2024-08-27T19:45:07Z","published":"2024-01-08T08:00:04Z","title":"Universal Time-Series Representation Learning: A Survey","summary":" Time-series data exists in every corner of real-world systems and services,\nranging from satellites in the sky to wearable devices on human bodies.\nLearning representations by extracting and inferring valuable information from\nthese time series is crucial for understanding the complex dynamics of\nparticular phenomena and enabling informed decisions. With the learned\nrepresentations, we can perform numerous downstream analyses more effectively.\nAmong several approaches, deep learning has demonstrated remarkable performance\nin extracting hidden patterns and features from time-series data without manual\nfeature engineering. This survey first presents a novel taxonomy based on three\nfundamental elements in designing state-of-the-art universal representation\nlearning methods for time series. According to the proposed taxonomy, we\ncomprehensively review existing studies and discuss their intuitions and\ninsights into how these methods enhance the quality of learned representations.\nFinally, as a guideline for future studies, we summarize commonly used\nexperimental setups and datasets and discuss several promising research\ndirections. An up-to-date corresponding resource is available at\nhttps://github.com/itouchz/awesome-deep-time-series-representations.\n","authors":["Patara Trirat","Yooju Shin","Junhyeok Kang","Youngeun Nam","Jihye Na","Minyoung Bae","Joeun Kim","Byunghyun Kim","Jae-Gil Lee"],"pdf_url":"https://arxiv.org/pdf/2401.03717v3.pdf","comment":"41 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.13683v2","updated":"2024-08-27T19:27:07Z","published":"2024-08-24T22:40:31Z","title":"Submodular Maximization Approaches for Equitable Client Selection in\n Federated Learning","summary":" In a conventional Federated Learning framework, client selection for training\ntypically involves the random sampling of a subset of clients in each\niteration. 
However, this random selection often leads to disparate performance\namong clients, raising concerns regarding fairness, particularly in\napplications where equitable outcomes are crucial, such as in medical or\nfinancial machine learning tasks. This disparity typically becomes more\npronounced with the advent of performance-centric client sampling techniques.\nThis paper introduces two novel methods, namely SUBTRUNC and UNIONFL, designed\nto address the limitations of random client selection. Both approaches utilize\nsubmodular function maximization to achieve more balanced models. By modifying\nthe facility location problem, they aim to mitigate the fairness concerns\nassociated with random selection. SUBTRUNC leverages client loss information to\ndiversify solutions, while UNIONFL relies on historical client selection data\nto ensure a more equitable performance of the final model. Moreover, these\nalgorithms are accompanied by robust theoretical guarantees regarding\nconvergence under reasonable assumptions. The efficacy of these methods is\ndemonstrated through extensive evaluations across heterogeneous scenarios,\nrevealing significant improvements in fairness as measured by a client\ndissimilarity metric.\n","authors":["Andrés Catalino Castillo Jiménez","Ege C. Kaya","Lintao Ye","Abolfazl Hashemi"],"pdf_url":"https://arxiv.org/pdf/2408.13683v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.15374v1","updated":"2024-08-27T19:22:06Z","published":"2024-08-27T19:22:06Z","title":"CycleGAN with Better Cycles","summary":" CycleGAN provides a framework to train image-to-image translation with\nunpaired datasets using cycle consistency loss [4]. While results are great in\nmany applications, the pixel-level cycle consistency can potentially be\nproblematic and cause unrealistic images in certain cases. In this project, we\npropose three simple modifications to cycle consistency, and show that such an\napproach achieves better results with fewer artifacts.\n","authors":["Tongzhou Wang","Yihan Lin"],"pdf_url":"https://arxiv.org/pdf/2408.15374v1.pdf","comment":"Technical Report 2018"},{"id":"http://arxiv.org/abs/2408.15373v1","updated":"2024-08-27T19:13:15Z","published":"2024-08-27T19:13:15Z","title":"Handling Geometric Domain Shifts in Semantic Segmentation of Surgical\n RGB and Hyperspectral Images","summary":" Robust semantic segmentation of intraoperative image data holds promise for\nenabling automatic surgical scene understanding and autonomous robotic surgery.\nWhile model development and validation are primarily conducted on idealistic\nscenes, geometric domain shifts, such as occlusions of the situs, are common in\nreal-world open surgeries. To close this gap, we (1) present the first analysis\nof state-of-the-art (SOA) semantic segmentation models when faced with\ngeometric out-of-distribution (OOD) data, and (2) propose an augmentation\ntechnique called \"Organ Transplantation\" to enhance generalizability. Our\ncomprehensive validation on six different OOD datasets, comprising 600 RGB and\nhyperspectral imaging (HSI) cubes from 33 pigs, each annotated with 19 classes,\nreveals a large performance drop in SOA organ segmentation models on geometric\nOOD data. This performance decline is observed not only in conventional RGB\ndata (with a dice similarity coefficient (DSC) drop of 46 %) but also in HSI\ndata (with a DSC drop of 45 %), despite the richer spectral information\ncontent. The performance decline increases with the spatial granularity of the\ninput data. 
Our augmentation technique improves SOA model performance by up to\n67 % for RGB data and 90 % for HSI data, achieving performance at the level of\nin-distribution performance on real OOD test data. Given the simplicity and\neffectiveness of our augmentation method, it is a valuable tool for addressing\ngeometric domain shifts in surgical scene segmentation, regardless of the\nunderlying model. Our code and pre-trained models are publicly available at\nhttps://github.com/IMSY-DKFZ/htc.\n","authors":["Silvia Seidlitz","Jan Sellner","Alexander Studier-Fischer","Alessandro Motta","Berkin Özdemir","Beat P. Müller-Stich","Felix Nickel","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2408.15373v1.pdf","comment":"Silvia Seidlitz and Jan Sellner contributed equally"},{"id":"http://arxiv.org/abs/2408.15371v1","updated":"2024-08-27T19:10:21Z","published":"2024-08-27T19:10:21Z","title":"Temporal Graph Neural Network-Powered Paper Recommendation on Dynamic\n Citation Networks","summary":" Due to the rapid growth of scientific publications, identifying all related\nreference articles in the literature has become increasingly challenging yet\nhighly demanding. Existing methods primarily assess candidate publications from\na static perspective, focusing on the content of articles and their structural\ninformation, such as citation relationships. There is a lack of research\nregarding how to account for the evolving impact among papers on their\nembeddings. Toward this goal, this paper introduces a temporal dimension to\npaper recommendation strategies. The core idea is to continuously update a\npaper's embedding when new citation relationships appear, enhancing its\nrelevance for future recommendations. Whenever a citation relationship is added\nto the literature upon the publication of a paper, the embeddings of the two\nrelated papers are updated through a Temporal Graph Neural Network (TGN). A\nlearnable memory update module based on a Recurrent Neural Network (RNN) is\nutilized to study the evolution of the embedding of a paper in order to predict\nits reference impact in a future timestamp. Such a TGN-based model learns a\npattern of how people's views of the paper may evolve, aiming to guide paper\nrecommendations more precisely. Extensive experiments on an open citation\nnetwork dataset, including 313,278 articles from\nhttps://paperswithcode.com/about PaperWithCode, have demonstrated the\neffectiveness of the proposed approach.\n","authors":["Junhao Shen","Mohammad Ausaf Ali Haqqani","Beichen Hu","Cheng Huang","Xihao Xie","Tsengdar Lee","Jia Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15371v1.pdf","comment":"10 pages, 4 figures, accepted by SDU@AAAI-2024. The AAAI Workshop on\n Scientific Document Understanding (2024)"},{"id":"http://arxiv.org/abs/2408.13912v2","updated":"2024-08-27T19:06:57Z","published":"2024-08-25T18:27:20Z","title":"Splatt3R: Zero-shot Gaussian Splatting from Uncalibrated Image Pairs","summary":" In this paper, we introduce Splatt3R, a pose-free, feed-forward method for\nin-the-wild 3D reconstruction and novel view synthesis from stereo pairs. Given\nuncalibrated natural images, Splatt3R can predict 3D Gaussian Splats without\nrequiring any camera parameters or depth information. For generalizability, we\nbuild Splatt3R upon a ``foundation'' 3D geometry reconstruction method, MASt3R,\nby extending it to deal with both 3D structure and appearance. 
Specifically,\nunlike the original MASt3R which reconstructs only 3D point clouds, we predict\nthe additional Gaussian attributes required to construct a Gaussian primitive\nfor each point. Hence, unlike other novel view synthesis methods, Splatt3R is\nfirst trained by optimizing the 3D point cloud's geometry loss, and then a\nnovel view synthesis objective. By doing this, we avoid the local minima\npresent in training 3D Gaussian Splats from stereo views. We also propose a\nnovel loss masking strategy that we empirically find is critical for strong\nperformance on extrapolated viewpoints. We train Splatt3R on the ScanNet++\ndataset and demonstrate excellent generalisation to uncalibrated, in-the-wild\nimages. Splatt3R can reconstruct scenes at 4FPS at 512 x 512 resolution, and\nthe resultant splats can be rendered in real-time.\n","authors":["Brandon Smart","Chuanxia Zheng","Iro Laina","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2408.13912v2.pdf","comment":"Our project page can be found at: https://splatt3r.active.vision/"},{"id":"http://arxiv.org/abs/2408.15368v1","updated":"2024-08-27T19:04:32Z","published":"2024-08-27T19:04:32Z","title":"Optimization Solution Functions as Deterministic Policies for Offline\n Reinforcement Learning","summary":" Offline reinforcement learning (RL) is a promising approach for many control\napplications but faces challenges such as limited data coverage and value\nfunction overestimation. In this paper, we propose an implicit actor-critic\n(iAC) framework that employs optimization solution functions as a deterministic\npolicy (actor) and a monotone function over the optimal value of optimization\nas a critic. By encoding optimality in the actor policy, we show that the\nlearned policies are robust to the suboptimality of the learned actor\nparameters via the exponentially decaying sensitivity (EDS) property. We obtain\nperformance guarantees for the proposed iAC framework and show its benefits\nover general function approximation schemes. Finally, we validate the proposed\nframework on two real-world applications and show a significant improvement\nover state-of-the-art (SOTA) offline RL methods.\n","authors":["Vanshaj Khattar","Ming Jin"],"pdf_url":"https://arxiv.org/pdf/2408.15368v1.pdf","comment":"American Control Conference 2024"},{"id":"http://arxiv.org/abs/2301.06267v5","updated":"2024-08-27T19:00:47Z","published":"2023-01-16T05:40:42Z","title":"Multimodality Helps Unimodality: Cross-Modal Few-Shot Learning with\n Multimodal Models","summary":" The ability to quickly learn a new task with minimal instruction - known as\nfew-shot learning - is a central aspect of intelligent agents. Classical\nfew-shot benchmarks make use of few-shot samples from a single modality, but\nsuch samples may not be sufficient to characterize an entire concept class. In\ncontrast, humans use cross-modal information to learn new concepts efficiently.\nIn this work, we demonstrate that one can indeed build a better ${\\bf visual}$\ndog classifier by ${\\bf read}$ing about dogs and ${\\bf listen}$ing to them\nbark. To do so, we exploit the fact that recent multimodal foundation models\nsuch as CLIP learn cross-modal encoders that map different modalities to the\nsame representation space. Specifically, we propose a simple strategy for ${\\bf\ncross-modal}$ ${\\bf adaptation}$: we treat examples from different modalities\nas additional few-shot examples. 
For example, by simply repurposing class names\nas an additional training sample, we trivially turn any n-shot learning problem\ninto a (n+1)-shot problem. This allows us to produce SOTA results with\nembarrassingly simple linear classifiers. We show that our approach can be\ncombined with existing methods such as prefix tuning, adapters, and classifier\nensembling. Finally, to explore other modalities beyond vision and language, we\nconstruct the first (to our knowledge) audiovisual few-shot benchmark and use\ncross-modal training to improve the performance of both image and audio\nclassification.\n","authors":["Zhiqiu Lin","Samuel Yu","Zhiyi Kuang","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2301.06267v5.pdf","comment":"Published at CVPR 2023. Project site:\n https://linzhiqiu.github.io/papers/cross_modal/"},{"id":"http://arxiv.org/abs/2403.13724v2","updated":"2024-08-27T18:42:55Z","published":"2024-03-20T16:33:06Z","title":"Probabilistic Forecasting with Stochastic Interpolants and Föllmer\n Processes","summary":" We propose a framework for probabilistic forecasting of dynamical systems\nbased on generative modeling. Given observations of the system state over time,\nwe formulate the forecasting problem as sampling from the conditional\ndistribution of the future system state given its current state. To this end,\nwe leverage the framework of stochastic interpolants, which facilitates the\nconstruction of a generative model between an arbitrary base distribution and\nthe target. We design a fictitious, non-physical stochastic dynamics that takes\nas initial condition the current system state and produces as output a sample\nfrom the target conditional distribution in finite time and without bias. This\nprocess therefore maps a point mass centered at the current state onto a\nprobabilistic ensemble of forecasts. We prove that the drift coefficient\nentering the stochastic differential equation (SDE) achieving this task is\nnon-singular, and that it can be learned efficiently by square loss regression\nover the time-series data. We show that the drift and the diffusion\ncoefficients of this SDE can be adjusted after training, and that a specific\nchoice that minimizes the impact of the estimation error gives a F\\\"ollmer\nprocess. We highlight the utility of our approach on several complex,\nhigh-dimensional forecasting problems, including stochastically forced\nNavier-Stokes and video prediction on the KTH and CLEVRER datasets.\n","authors":["Yifan Chen","Mark Goldstein","Mengjian Hua","Michael S. Albergo","Nicholas M. Boffi","Eric Vanden-Eijnden"],"pdf_url":"https://arxiv.org/pdf/2403.13724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12091v2","updated":"2024-08-27T18:34:24Z","published":"2024-08-22T03:00:21Z","title":"Unsupervised discovery of the shared and private geometry in multi-view\n data","summary":" Modern applications often leverage multiple views of a subject of study.\nWithin neuroscience, there is growing interest in large-scale simultaneous\nrecordings across multiple brain regions. Understanding the relationship\nbetween views (e.g., the neural activity in each region recorded) can reveal\nfundamental principles about the characteristics of each representation and\nabout the system. 
However, existing methods to characterize such relationships\neither lack the expressivity required to capture complex nonlinearities,\ndescribe only sources of variance that are shared between views, or discard\ngeometric information that is crucial to interpreting the data. Here, we\ndevelop a nonlinear neural network-based method that, given paired samples of\nhigh-dimensional views, disentangles low-dimensional shared and private latent\nvariables underlying these views while preserving intrinsic data geometry.\nAcross multiple simulated and real datasets, we demonstrate that our method\noutperforms competing methods. Using simulated populations of lateral\ngeniculate nucleus (LGN) and V1 neurons, we demonstrate our model's ability to\ndiscover interpretable shared and private structure across different noise\nconditions. On a dataset of unrotated and corresponding but randomly rotated\nMNIST digits, we recover private latents for the rotated view that encode\nrotation angle regardless of digit class, and place the angle representation\non a 1-d manifold, while shared latents encode digit class but not rotation\nangle. Applying our method to simultaneous Neuropixels recordings of\nhippocampus and prefrontal cortex while mice run on a linear track, we discover\na low-dimensional shared latent space that encodes the animal's position. We\npropose our approach as a general-purpose method for finding succinct and\ninterpretable descriptions of paired data sets in terms of disentangled shared\nand private latent variables.\n","authors":["Sai Koukuntla","Joshua B. Julian","Jesse C. Kaminsky","Manuel Schottdorf","David W. Tank","Carlos D. Brody","Adam S. Charles"],"pdf_url":"https://arxiv.org/pdf/2408.12091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15357v1","updated":"2024-08-27T18:29:47Z","published":"2024-08-27T18:29:47Z","title":"On the effectiveness of smartphone IMU sensors and Deep Learning in the\n detection of cardiorespiratory conditions","summary":" This research introduces an innovative method for the early screening of\ncardiorespiratory diseases based on an acquisition protocol, which leverages\ncommodity smartphones' Inertial Measurement Units (IMUs) and deep learning\ntechniques. We collected, in a clinical setting, a dataset featuring recordings\nof breathing kinematics obtained by accelerometer and gyroscope readings from\nfive distinct body regions. We propose an end-to-end deep learning pipeline for\nearly cardiorespiratory disease screening, incorporating a preprocessing step\nsegmenting the data into individual breathing cycles, and a recurrent\nbidirectional module capturing features from diverse body regions. We employed\nleave-one-out cross-validation with Bayesian optimization for hyperparameter\ntuning and model selection. The experimental results consistently demonstrated\nthe superior performance of a bidirectional Long Short-Term Memory (Bi-LSTM) as\na feature encoder architecture, yielding an average sensitivity of $0.81 \\pm\n0.02$, specificity of $0.82 \\pm 0.05$, F1 score of $0.81 \\pm 0.02$, and\naccuracy of $80.2\\% \\pm 3.9$ across diverse seed variations. We also assessed\ngeneralization capabilities on a skewed distribution, comprising exclusively\nhealthy patients not used in training, revealing a true negative rate of $74.8\n\\% \\pm 4.5$. 
The sustained accuracy of predictions over time during breathing\ncycles within a single patient underscores the efficacy of the preprocessing\nstrategy, highlighting the model's ability to discern significant patterns\nthroughout distinct phases of the respiratory cycle. This investigation\nunderscores the potential usefulness of widely available smartphones as devices\nfor timely cardiorespiratory disease screening in the general population, in\nat-home settings, offering crucial assistance to public health efforts\n(especially during pandemic outbreaks, such as the recent COVID-19 pandemic).\n","authors":["Lorenzo Simone","Luca Miglior","Vincenzo Gervasi","Luca Moroni","Emanuele Vignali","Emanuele Gasparotti","Simona Celi"],"pdf_url":"https://arxiv.org/pdf/2408.15357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15356v1","updated":"2024-08-27T18:28:31Z","published":"2024-08-27T18:28:31Z","title":"Optimal level set estimation for non-parametric tournament and\n crowdsourcing problems","summary":" Motivated by crowdsourcing, we consider a problem where we partially observe\nthe correctness of the answers of $n$ experts on $d$ questions. In this paper,\nwe assume that both the experts and the questions can be ordered, namely that\nthe matrix $M$ containing the probability that expert $i$ answers question $j$\ncorrectly is bi-isotonic up to a permutation of its rows and columns. When\n$n=d$, this also encompasses the strongly stochastic transitive (SST) model\nfrom the tournament literature. Here, we focus on the relevant problem of\ndeciphering small entries of $M$ from large entries of $M$, which is key in\ncrowdsourcing for efficient allocation of workers to questions. More precisely,\nwe aim at recovering a (or several) level set $p$ of the matrix up to a\nprecision $h$, namely recovering resp. the sets of positions $(i,j)$ in $M$\nsuch that $M_{ij}>p+h$ and $M_{i,j}