From fdb1b6785aef159b82e1af973411beac35c4057b Mon Sep 17 00:00:00 2001 From: Zhao Penghai Date: Thu, 8 Aug 2024 12:02:55 +0800 Subject: [PATCH] 240808 --- README.md | 15 +- TKPD/TKPD.csv | 3628 +++++++++++++++++++++++++++ TKPD/prompt_keyword_async_search.py | 84 + previous_methods/Doc2Vec&Bi-LSTM.py | 250 ++ previous_methods/Ensemble_MLP.py | 97 + previous_methods/GPT_Predict.py | 177 ++ previous_methods/MLP.py | 95 + 7 files changed, 4345 insertions(+), 1 deletion(-) create mode 100644 TKPD/TKPD.csv create mode 100644 TKPD/prompt_keyword_async_search.py create mode 100644 previous_methods/Doc2Vec&Bi-LSTM.py create mode 100644 previous_methods/Ensemble_MLP.py create mode 100644 previous_methods/GPT_Predict.py create mode 100644 previous_methods/MLP.py diff --git a/README.md b/README.md index 1a9f4e5..62947f9 100644 --- a/README.md +++ b/README.md @@ -78,9 +78,22 @@ OMP_NUM_THREADS=1 accelerate launch offcial_train.py \ Then, type `sh train.sh` in the console. Wating for the training ends~ ## Testing (batch) +Similar to Fine-tuning, prepare `test.sh` as below: +``` python inference.py \ --data_path ScImpactPredict/NAID/NAID_test_extrainfo.csv \ --weight_dir path_to_runs_dir +``` +Then, type `sh test.sh`. ## Testing (single article) -Just modified the `single_pred.py` file, then type `python single_pred.py`. \ No newline at end of file +Just modified the `single_pred.py` file, then type `python single_pred.py`. + +## Model Weights +First, download the LLaMA-3 pretrained weights from the official Hugging Face site. +Then, download the provided LoRA weights (runs_dir) [here](https://drive.google.com/file/d/13-ugXsm35AuzOBUlL6jPacY_z8qVIb7x/view?usp=sharing). + +## Compare with Previous Methods +With a few adjustments based on your specific needs, the scripts under `previous_methods/` should work fine. Since these models train very quickly (less than a few minutes on a single RTX 3080), we won’t be providing the trained models. 
+ +### We are pretty confident in our methodology and experiments, and you should be able to achieve any of the performance reported in our paper. \ No newline at end of file diff --git a/TKPD/TKPD.csv b/TKPD/TKPD.csv new file mode 100644 index 0000000..982fdd4 --- /dev/null +++ b/TKPD/TKPD.csv @@ -0,0 +1,3628 @@ +title,abstract,field, +Active Token Mixer,"The three existing dominant network families, i.e., CNNs, Transformers, and +MLPs, differ from each other mainly in the ways of fusing spatial contextual +information, leaving designing more effective token-mixing mechanisms at the +core of backbone architecture development. In this work, we propose an +innovative token-mixer, dubbed Active Token Mixer (ATM), to actively +incorporate flexible contextual information distributed across different +channels from other tokens into the given query token. This fundamental +operator actively predicts where to capture useful contexts and learns how to +fuse the captured contexts with the query token at channel level. In this way, +the spatial range of token-mixing can be expanded to a global scope with +limited computational complexity, where the way of token-mixing is reformed. We +take ATM as the primary operator and assemble ATMs into a cascade architecture, +dubbed ATMNet. Extensive experiments demonstrate that ATMNet is generally +applicable and comprehensively surpasses different families of SOTA vision +backbones by a clear margin on a broad range of vision tasks, including visual +recognition and dense prediction tasks. Code is available at +https://github.com/microsoft/ActiveMLP.",token mixing mechanism, +Enhancing Multi-modal and Multi-hop Question Answering via Structured Knowledge and Unified Retrieval-Generation,"Multi-modal multi-hop question answering involves answering a question by +reasoning over multiple input sources from different modalities. 
Existing +methods often retrieve evidences separately and then use a language model to +generate an answer based on the retrieved evidences, and thus do not adequately +connect candidates and are unable to model the interdependent relations during +retrieval. Moreover, the pipelined approaches of retrieval and generation might +result in poor generation performance when retrieval performance is low. To +address these issues, we propose a Structured Knowledge and Unified +Retrieval-Generation (SKURG) approach. SKURG employs an Entity-centered Fusion +Encoder to align sources from different modalities using shared entities. It +then uses a unified Retrieval-Generation Decoder to integrate intermediate +retrieval results for answer generation and also adaptively determine the +number of retrieval steps. Extensive experiments on two representative +multi-modal multi-hop QA datasets MultimodalQA and WebQA demonstrate that SKURG +outperforms the state-of-the-art models in both source retrieval and answer +generation performance with fewer parameters. Our code is available at +https://github.com/HITsz-TMG/SKURG.",question answering, +Effective classification of ECG signals using enhanced convolutional neural network in IOT,"In this paper, a novel ECG monitoring approach based on IoT technology is +suggested. This paper proposes a routing system for IoT healthcare platforms +based on Dynamic Source Routing (DSR) and Routing by Energy and Link Quality +(REL). In addition, the Artificial Neural Network (ANN), Support Vector Machine +(SVM), and Convolution Neural Networks (CNNs)-based approaches for ECG signal +categorization were tested in this study. Deep-ECG will employ a deep CNN to +extract important characteristics, which will then be compared using simple and +fast distance functions in order to classify cardiac problems efficiently. 
This +work has suggested algorithms for the categorization of ECG data acquired from +mobile watch users in order to identify aberrant data. The Massachusetts +Institute of Technology (MIT) and Beth Israel Hospital (MIT/BIH) Arrhythmia +Database have been used for experimental verification of the suggested +approaches. The results show that the proposed strategy outperforms others in +terms of classification accuracy.",ECG signal classification, +Incentivizing Federated Learning,"Federated Learning is an emerging distributed collaborative learning paradigm +used by many of applications nowadays. The effectiveness of federated learning +relies on clients' collective efforts and their willingness to contribute local +data. However, due to privacy concerns and the costs of data collection and +model training, clients may not always contribute all the data they possess, +which would negatively affect the performance of the global model. This paper +presents an incentive mechanism that encourages clients to contribute as much +data as they can obtain. Unlike previous incentive mechanisms, our approach +does not monetize data. Instead, we implicitly use model performance as a +reward, i.e., significant contributors are paid off with better models. We +theoretically prove that clients will use as much data as they can possibly +possess to participate in federated learning under certain conditions with our +incentive mechanism",federated learning, +Roadmap to Autonomous Surgery -- A Framework to Surgical Autonomy,"Robotic surgery has increased the domain of surgeries possible. Several +examples of partial surgical automation have been seen in the past decade. We +break down the path of automation tasks into features required and provide a +checklist that can help reach higher levels of surgical automation. 
Finally, we +discuss the current challenges and advances required to make this happen.",surgical automation, +Vision-and-Language Pretraining,"With the burgeoning amount of data of image-text pairs and diversity of +Vision-and-Language (V\&L) tasks, scholars have introduced an abundance of deep +learning models in this research domain. Furthermore, in recent years, transfer +learning has also shown tremendous success in Computer Vision for tasks such as +Image Classification, Object Detection, etc., and in Natural Language +Processing for Question Answering, Machine Translation, etc. Inheriting the +spirit of Transfer Learning, research works in V\&L have devised multiple +pretraining techniques on large-scale datasets in order to enhance the +performance of downstream tasks. The aim of this article is to provide a +comprehensive revision of contemporary V\&L pretraining models. In particular, +we categorize and delineate pretraining approaches, along with the summary of +state-of-the-art vision-and-language pretrained models. Moreover, a list of +training datasets and downstream tasks is supplied to further polish the +perspective into V\&L pretraining. Lastly, we decided to take a further step to +discuss numerous directions for future research.",vision-and-language pretraining, +Oracle-MNIST: a Dataset of Oracle Characters for Benchmarking Machine Learning Algorithms,"We introduce the Oracle-MNIST dataset, comprising of 28$\times $28 grayscale +images of 30,222 ancient characters from 10 categories, for benchmarking +pattern classification, with particular challenges on image noise and +distortion. The training set totally consists of 27,222 images, and the test +set contains 300 images per class. Oracle-MNIST shares the same data format +with the original MNIST dataset, allowing for direct compatibility with all +existing classifiers and systems, but it constitutes a more challenging +classification task than MNIST. 
The images of ancient characters suffer from 1) +extremely serious and unique noises caused by three-thousand years of burial +and aging and 2) dramatically variant writing styles by ancient Chinese, which +all make them realistic for machine learning research. The dataset is freely +available at https://github.com/wm-bupt/oracle-mnist.",oracle character recognition, +Video Demoireing with Relation-Based Temporal Consistency,"Moire patterns, appearing as color distortions, severely degrade image and +video qualities when filming a screen with digital cameras. Considering the +increasing demands for capturing videos, we study how to remove such +undesirable moire patterns in videos, namely video demoireing. To this end, we +introduce the first hand-held video demoireing dataset with a dedicated data +collection pipeline to ensure spatial and temporal alignments of captured data. +Further, a baseline video demoireing model with implicit feature space +alignment and selective feature aggregation is developed to leverage +complementary information from nearby frames to improve frame-level video +demoireing. More importantly, we propose a relation-based temporal consistency +loss to encourage the model to learn temporal consistency priors directly from +ground-truth reference videos, which facilitates producing temporally +consistent predictions and effectively maintains frame-level qualities. +Extensive experiments manifest the superiority of our model. Code is available +at \url{https://daipengwa.github.io/VDmoire_ProjectPage/}.",video demoireing, +Causal Discovery and Knowledge Injection for Contestable Neural Networks (with Appendices),"Neural networks have proven to be effective at solving machine learning tasks +but it is unclear whether they learn any relevant causal relationships, while +their black-box nature makes it difficult for modellers to understand and debug +them. 
We propose a novel method overcoming these issues by allowing a two-way +interaction whereby neural-network-empowered machines can expose the +underpinning learnt causal graphs and humans can contest the machines by +modifying the causal graphs before re-injecting them into the machines. The +learnt models are guaranteed to conform to the graphs and adhere to expert +knowledge, some of which can also be given up-front. By building a window into +the model behaviour and enabling knowledge injection, our method allows +practitioners to debug networks based on the causal structure discovered from +the data and underpinning the predictions. Experiments with real and synthetic +tabular data show that our method improves predictive performance up to 2.4x +while producing parsimonious networks, up to 7x smaller in the input layer, +compared to SOTA regularised networks.",causal discovery, +Encoder-Decoder Network with Guided Transmission Map: Architecture,"An insight into the architecture of the Encoder-Decoder Network with Guided +Transmission Map (EDN-GTM), a novel and effective single image dehazing scheme, +is presented in this paper. The EDN-GTM takes a conventional RGB hazy image in +conjunction with the corresponding transmission map estimated by the dark +channel prior (DCP) approach as inputs of the network. The EDN-GTM adopts an +enhanced structure of U-Net developed for dehazing tasks and the resulting +EDN-GDM has shown state-of-the-art performances on benchmark dehazing datasets +in terms of PSNR and SSIM metrics. 
In order to give an in-depth understanding +of the well-designed architecture which largely contributes to the success of +the EDN-GTM, extensive experiments and analysis from selecting the core +structure of the scheme to investigating advanced network designs are presented +in this paper.",image dehazing, +CAPSTONE: Curriculum Sampling for Dense Retrieval with Document Expansion,"The dual-encoder has become the de facto architecture for dense retrieval. +Typically, it computes the latent representations of the query and document +independently, thus failing to fully capture the interactions between the query +and document. To alleviate this, recent research has focused on obtaining +query-informed document representations. During training, it expands the +document with a real query, but during inference, it replaces the real query +with a generated one. This inconsistency between training and inference causes +the dense retrieval model to prioritize query information while disregarding +the document when computing the document representation. Consequently, it +performs even worse than the vanilla dense retrieval model because its +performance heavily relies on the relevance between the generated queries and +the real query.In this paper, we propose a curriculum sampling strategy that +utilizes pseudo queries during training and progressively enhances the +relevance between the generated query and the real query. By doing so, the +retrieval model learns to extend its attention from the document alone to both +the document and query, resulting in high-quality query-informed document +representations. 
Experimental results on both in-domain and out-of-domain +datasets demonstrate that our approach outperforms previous dense retrieval +models.",dense retrieval, +Attack-Agnostic Adversarial Detection,"The growing number of adversarial attacks in recent years gives attackers an +advantage over defenders, as defenders must train detectors after knowing the +types of attacks, and many models need to be maintained to ensure good +performance in detecting any upcoming attacks. We propose a way to end the +tug-of-war between attackers and defenders by treating adversarial attack +detection as an anomaly detection problem so that the detector is agnostic to +the attack. We quantify the statistical deviation caused by adversarial +perturbations in two aspects. The Least Significant Component Feature (LSCF) +quantifies the deviation of adversarial examples from the statistics of benign +samples and Hessian Feature (HF) reflects how adversarial examples distort the +landscape of the model's optima by measuring the local loss curvature. +Empirical results show that our method can achieve an overall ROC AUC of 94.9%, +89.7%, and 94.6% on CIFAR10, CIFAR100, and SVHN, respectively, and has +comparable performance to adversarial detectors trained with adversarial +examples on most of the attacks.",adversarial attack detection, +Addressing Gap between Training Data and Deployed Environment by On-Device Learning,"The accuracy of tinyML applications is often affected by various +environmental factors, such as noises, location/calibration of sensors, and +time-related changes. This article introduces a neural network based on-device +learning (ODL) approach to address this issue by retraining in deployed +environments. Our approach relies on semi-supervised sequential training of +multiple neural networks tailored for low-end edge devices. 
This article +introduces its algorithm and implementation on wireless sensor nodes consisting +of a Raspberry Pi Pico and low-power wireless module. Experiments using +vibration patterns of rotating machines demonstrate that retraining by ODL +improves anomaly detection accuracy compared with a prediction-only deep neural +network in a noisy environment. The results also show that the ODL approach can +save communication cost and energy consumption for battery-powered Internet of +Things devices.",on-device learning, +LSCDiscovery: A shared task on semantic change discovery and detection in Spanish,"We present the first shared task on semantic change discovery and detection +in Spanish and create the first dataset of Spanish words manually annotated for +semantic change using the DURel framework (Schlechtweg et al., 2018). The task +is divided in two phases: 1) Graded Change Discovery, and 2) Binary Change +Detection. In addition to introducing a new language the main novelty with +respect to the previous tasks consists in predicting and evaluating changes for +all vocabulary words in the corpus. Six teams participated in phase 1 and seven +teams in phase 2 of the shared task, and the best system obtained a Spearman +rank correlation of 0.735 for phase 1 and an F1 score of 0.716 for phase 2. We +describe the systems developed by the competing teams, highlighting the +techniques that were particularly useful and discuss the limits of these +approaches.",semantic change detection, +MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving Camera Videos,"Convolutional neural network inference on video input is computationally +expensive and requires high memory bandwidth. Recently, DeltaCNN managed to +reduce the cost by only processing pixels with significant updates over the +previous frame. However, DeltaCNN relies on static camera input. 
Moving cameras +add new challenges in how to fuse newly unveiled image regions with already +processed regions efficiently to minimize the update rate - without increasing +memory overhead and without knowing the camera extrinsics of future frames. In +this work, we propose MotionDeltaCNN, a sparse CNN inference framework that +supports moving cameras. We introduce spherical buffers and padded convolutions +to enable seamless fusion of newly unveiled regions and previously processed +regions -- without increasing memory footprint. Our evaluation shows that we +outperform DeltaCNN by up to 90% for moving camera videos.",sparse CNN inference, +Power Bundle Adjustment for Large-Scale 3D Reconstruction,"We introduce Power Bundle Adjustment as an expansion type algorithm for +solving large-scale bundle adjustment problems. It is based on the power series +expansion of the inverse Schur complement and constitutes a new family of +solvers that we call inverse expansion methods. We theoretically justify the +use of power series and we prove the convergence of our approach. Using the +real-world BAL dataset we show that the proposed solver challenges the +state-of-the-art iterative methods and significantly accelerates the solution +of the normal equation, even for reaching a very high accuracy. This +easy-to-implement solver can also complement a recently presented distributed +bundle adjustment framework. We demonstrate that employing the proposed Power +Bundle Adjustment as a sub-problem solver significantly improves speed and +accuracy of the distributed optimization.",large-scale 3D reconstruction, +Exploring Cross-lingual Textual Style Transfer with Large Multilingual Language Models,"Detoxification is a task of generating text in polite style while preserving +meaning and fluency of the original toxic text. Existing detoxification methods +are designed to work in one exact language. 
This work investigates multilingual +and cross-lingual detoxification and the behavior of large multilingual models +like in this setting. Unlike previous works we aim to make large language +models able to perform detoxification without direct fine-tuning in given +language. Experiments show that multilingual models are capable of performing +multilingual style transfer. However, models are not able to perform +cross-lingual detoxification and direct fine-tuning on exact language is +inevitable.",cross-lingual detoxification, +Modular Domain Adaptation,"Off-the-shelf models are widely used by computational social science +researchers to measure properties of text, such as sentiment. However, without +access to source data it is difficult to account for domain shift, which +represents a threat to validity. Here, we treat domain adaptation as a modular +process that involves separate model producers and model consumers, and show +how they can independently cooperate to facilitate more accurate measurements +of text. We introduce two lightweight techniques for this scenario, and +demonstrate that they reliably increase out-of-domain accuracy on four +multi-domain text classification datasets when used with linear and contextual +embedding models. We conclude with recommendations for model producers and +consumers, and release models and replication code to accompany this paper.",Domain Adaptation Text Classification, +CUNI Systems for the WMT22 Czech-Ukrainian Translation Task,"We present Charles University submissions to the WMT22 General Translation +Shared Task on Czech-Ukrainian and Ukrainian-Czech machine translation. We +present two constrained submissions based on block back-translation and tagged +back-translation and experiment with rule-based romanization of Ukrainian. Our +results show that the romanization only has a minor effect on the translation +quality. 
Further, we describe Charles Translator, a system that was developed +in March 2022 as a response to the migration from Ukraine to the Czech +Republic. Compared to our constrained systems, it did not use the romanization +and used some proprietary data sources.",machine translation, +"C$^3$Fusion: Consistent Contrastive Colon Fusion, Towards Deep SLAM in Colonoscopy","3D colon reconstruction from Optical Colonoscopy (OC) to detect non-examined +surfaces remains an unsolved problem. The challenges arise from the nature of +optical colonoscopy data, characterized by highly reflective low-texture +surfaces, drastic illumination changes and frequent tracking loss. Recent +methods demonstrate compelling results, but suffer from: (1) frangible +frame-to-frame (or frame-to-model) pose estimation resulting in many tracking +failures; or (2) rely on point-based representations at the cost of scan +quality. In this paper, we propose a novel reconstruction framework that +addresses these issues end to end, which result in both quantitatively and +qualitatively accurate and robust 3D colon reconstruction. Our SLAM approach, +which employs correspondences based on contrastive deep features, and deep +consistent depth maps, estimates globally optimized poses, is able to recover +from frequent tracking failures, and estimates a global consistent 3D model; +all within a single framework. We perform an extensive experimental evaluation +on multiple synthetic and real colonoscopy videos, showing high-quality results +and comparisons against relevant baselines.",3D colon reconstruction, +LatentKeypointGAN: Controlling Images via Latent Keypoints -- Extended Abstract,"Generative adversarial networks (GANs) can now generate photo-realistic +images. However, how to best control the image content remains an open +challenge. 
We introduce LatentKeypointGAN, a two-stage GAN internally +conditioned on a set of keypoints and associated appearance embeddings +providing control of the position and style of the generated objects and their +respective parts. A major difficulty that we address is disentangling the image +into spatial and appearance factors with little domain knowledge and +supervision signals. We demonstrate in a user study and quantitative +experiments that LatentKeypointGAN provides an interpretable latent space that +can be used to re-arrange the generated images by re-positioning and exchanging +keypoint embeddings, such as generating portraits by combining the eyes, and +mouth from different images. Notably, our method does not require labels as it +is self-supervised and thereby applies to diverse application domains, such as +editing portraits, indoor rooms, and full-body human poses.",Generative adversarial network, +Planning with Spatial-Temporal Abstraction from Point Clouds for Deformable Object Manipulation,"Effective planning of long-horizon deformable object manipulation requires +suitable abstractions at both the spatial and temporal levels. Previous methods +typically either focus on short-horizon tasks or make strong assumptions that +full-state information is available, which prevents their use on deformable +objects. In this paper, we propose PlAnning with Spatial-Temporal Abstraction +(PASTA), which incorporates both spatial abstraction (reasoning about objects +and their relations to each other) and temporal abstraction (reasoning over +skills instead of low-level actions). Our framework maps high-dimension 3D +observations such as point clouds into a set of latent vectors and plans over +skill sequences on top of the latent set representation. 
We show that our +method can effectively perform challenging sequential deformable object +manipulation tasks in the real world, which require combining multiple tool-use +skills such as cutting with a knife, pushing with a pusher, and spreading the +dough with a roller.",deformable object manipulation, +1st Place Solution of The Robust Vision Challenge 2022 Semantic Segmentation Track,"This report describes the winning solution to the Robust Vision Challenge +(RVC) semantic segmentation track at ECCV 2022. Our method adopts the +FAN-B-Hybrid model as the encoder and uses SegFormer as the segmentation +framework. The model is trained on a composite dataset consisting of images +from 9 datasets (ADE20K, Cityscapes, Mapillary Vistas, ScanNet, VIPER, WildDash +2, IDD, BDD, and COCO) with a simple dataset balancing strategy. All the +original labels are projected to a 256-class unified label space, and the model +is trained using a cross-entropy loss. Without significant hyperparameter +tuning or any specific loss weighting, our solution ranks the first place on +all the testing semantic segmentation benchmarks from multiple domains (ADE20K, +Cityscapes, Mapillary Vistas, ScanNet, VIPER, and WildDash 2). The proposed +method can serve as a strong baseline for the multi-domain segmentation task +and benefit future works. Code will be available at +https://github.com/lambert-x/RVC_Segmentation.",multi-domain semantic segmentation, +Oracles & Followers: Stackelberg Equilibria in Deep Multi-Agent Reinforcement Learning,"Stackelberg equilibria arise naturally in a range of popular learning +problems, such as in security games or indirect mechanism design, and have +received increasing attention in the reinforcement learning literature. We +present a general framework for implementing Stackelberg equilibria search as a +multi-agent RL problem, allowing a wide range of algorithmic design choices. 
We +discuss how previous approaches can be seen as specific instantiations of this +framework. As a key insight, we note that the design space allows for +approaches not previously seen in the literature, for instance by leveraging +multitask and meta-RL techniques for follower convergence. We propose one such +approach using contextual policies, and evaluate it experimentally on both +standard and novel benchmark domains, showing greatly improved sample +efficiency compared to previous approaches. Finally, we explore the effect of +adopting algorithm designs outside the borders of our framework.",Stackelberg equilibria search, +Randomized Smoothing under Attack: How Good is it in Pratice?,"Randomized smoothing is a recent and celebrated solution to certify the +robustness of any classifier. While it indeed provides a theoretical robustness +against adversarial attacks, the dimensionality of current classifiers +necessarily imposes Monte Carlo approaches for its application in practice. +This paper questions the effectiveness of randomized smoothing as a defense, +against state of the art black-box attacks. This is a novel perspective, as +previous research works considered the certification as an unquestionable +guarantee. We first formally highlight the mismatch between a theoretical +certification and the practice of attacks on classifiers. We then perform +attacks on randomized smoothing as a defense. 
Our main observation is that +there is a major mismatch in the settings of the RS for obtaining high +certified robustness or when defeating black box attacks while preserving the +classifier accuracy.",Randomized Smoothing Defense, +Generating Full Length Wikipedia Biographies: The Impact of Gender Bias on the Retrieval-Based Generation of Women Biographies,"Generating factual, long-form text such as Wikipedia articles raises three +key challenges: how to gather relevant evidence, how to structure information +into well-formed text, and how to ensure that the generated text is factually +correct. We address these by developing a model for English text that uses a +retrieval mechanism to identify relevant supporting information on the web and +a cache-based pre-trained encoder-decoder to generate long-form biographies +section by section, including citation information. To assess the impact of +available web evidence on the output text, we compare the performance of our +approach when generating biographies about women (for which less information is +available on the web) vs. biographies generally. To this end, we curate a +dataset of 1,500 biographies about women. We analyze our generated text to +understand how differences in available web evidence data affect generation. We +evaluate the factuality, fluency, and quality of the generated texts using +automatic metrics and human evaluation. We hope that these techniques can be +used as a starting point for human writers, to aid in reducing the complexity +inherent in the creation of long-form, factual text.",long-form text generation, +Keep Your Friends Close & Enemies Farther: Debiasing Contrastive Learning with Spatial Priors in 3D Radiology Images,"Understanding of spatial attributes is central to effective 3D radiology +image analysis where crop-based learning is the de facto standard. 
Given an +image patch, its core spatial properties (e.g., position & orientation) provide +helpful priors on expected object sizes, appearances, and structures through +inherent anatomical consistencies. Spatial correspondences, in particular, can +effectively gauge semantic similarities between inter-image regions, while +their approximate extraction requires no annotations or overbearing +computational costs. However, recent 3D contrastive learning approaches either +neglect correspondences or fail to maximally capitalize on them. To this end, +we propose an extensible 3D contrastive framework (Spade, for Spatial +Debiasing) that leverages extracted correspondences to select more effective +positive & negative samples for representation learning. Our method learns both +globally invariant and locally equivariant representations with downstream +segmentation in mind. We also propose separate selection strategies for global +& local scopes that tailor to their respective representational requirements. +Compared to recent state-of-the-art approaches, Spade shows notable +improvements on three downstream segmentation tasks (CT Abdominal Organ, CT +Heart, MR Heart).", 3D contrastive learning, +Euler State Networks: Non-dissipative Reservoir Computing,"Inspired by the numerical solution of ordinary differential equations, in +this paper we propose a novel Reservoir Computing (RC) model, called the Euler +State Network (EuSN). The presented approach makes use of forward Euler +discretization and antisymmetric recurrent matrices to design reservoir +dynamics that are both stable and non-dissipative by construction. + Our mathematical analysis shows that the resulting model is biased towards a +unitary effective spectral radius and zero local Lyapunov exponents, +intrinsically operating near to the edge of stability. 
Experiments on long-term +memory tasks show the clear superiority of the proposed approach over standard +RC models in problems requiring effective propagation of input information over +multiple time-steps. Furthermore, results on time-series classification +benchmarks indicate that EuSN is able to match (or even exceed) the accuracy of +trainable Recurrent Neural Networks, while retaining the training efficiency of +the RC family, resulting in up to $\approx$ 490-fold savings in computation +time and $\approx$ 1750-fold savings in energy consumption.",Reservoir Computing, +Improving Pre-trained Language Model Fine-tuning with Noise Stability Regularization,"The advent of large-scale pre-trained language models has contributed greatly +to the recent progress in natural language processing. Many state-of-the-art +language models are first trained on a large text corpus and then fine-tuned on +downstream tasks. Despite its recent success and wide adoption, fine-tuning a +pre-trained language model often suffers from overfitting, which leads to poor +generalizability due to the extremely high complexity of the model and the +limited training samples from downstream tasks. To address this problem, we +propose a novel and effective fine-tuning framework, named Layerwise Noise +Stability Regularization (LNSR). Specifically, we propose to inject the +standard Gaussian noise or In-manifold noise and regularize hidden +representations of the fine-tuned model. We first provide theoretical analyses +to support the efficacy of our method. We then demonstrate the advantages of +the proposed method over other state-of-the-art algorithms including L2-SP, +Mixout and SMART. While these previous works only verify the effectiveness of +their methods on relatively simple text classification tasks, we also verify +the effectiveness of our method on question answering tasks, where the target +problem is much more difficult and more training examples are available. 
+Furthermore, extensive experimental results indicate that the proposed +algorithm can not only enhance the in-domain performance of the language models +but also improve the domain generalization performance on out-of-domain data.",Language Model Fine-tuning, +Revisiting L1 Loss in Super-Resolution: A Probabilistic View and Beyond,"Super-resolution as an ill-posed problem has many high-resolution candidates +for a low-resolution input. However, the popular $\ell_1$ loss used to best fit +the given HR image fails to consider this fundamental property of +non-uniqueness in image restoration. In this work, we fix the missing piece in +$\ell_1$ loss by formulating super-resolution with neural networks as a +probabilistic model. It shows that $\ell_1$ loss is equivalent to a degraded +likelihood function that removes the randomness from the learning process. By +introducing a data-adaptive random variable, we present a new objective +function that aims at minimizing the expectation of the reconstruction error +over all plausible solutions. The experimental results show consistent +improvements on mainstream architectures, with no extra parameter or computing +cost at inference time.",Super-Resolution, +Neural Mesh-Based Graphics,"We revisit NPBG, the popular approach to novel view synthesis that introduced +the ubiquitous point feature neural rendering paradigm. We are interested in +particular in data-efficient learning with fast view synthesis. We achieve this +through a view-dependent mesh-based denser point descriptor rasterization, in +addition to a foreground/background scene rendering split, and an improved +loss. By training solely on a single scene, we outperform NPBG, which has been +trained on ScanNet and then scene finetuned. 
We also perform competitively with +respect to the state-of-the-art method SVS, which has been trained on the full +dataset (DTU and Tanks and Temples) and then scene finetuned, in spite of their +deeper neural renderer.",neural rendering, +Applicability limitations of differentiable full-reference image-quality,"Subjective image-quality measurement plays a critical role in the development +of image-processing applications. The purpose of a visual-quality metric is to +approximate the results of subjective assessment. In this regard, more and more +metrics are under development, but little research has considered their +limitations. This paper addresses that deficiency: we show how image +preprocessing before compression can artificially increase the quality scores +provided by the popular metrics DISTS, LPIPS, HaarPSI, and VIF as well as how +these scores are inconsistent with subjective-quality scores. We propose a +series of neural-network preprocessing models that increase DISTS by up to +34.5%, LPIPS by up to 36.8%, VIF by up to 98.0%, and HaarPSI by up to 22.6% in +the case of JPEG-compressed images. A subjective comparison of preprocessed +images showed that for most of the metrics we examined, visual quality drops or +stays unchanged, limiting the applicability of these metrics.",visual-quality metric, +Team Yao at Factify 2022: Utilizing Pre-trained Models and Co-attention Networks for Multi-Modal Fact Verification,"In recent years, social media has enabled users to get exposed to a myriad of +misinformation and disinformation; thus, misinformation has attracted a great +deal of attention in research fields and as a social issue. To address the +problem, we propose a framework, Pre-CoFact, composed of two pre-trained models +for extracting features from text and images, and multiple co-attention +networks for fusing the same modality but different sources and different +modalities. 
Besides, we adopt the ensemble method by using different +pre-trained models in Pre-CoFact to achieve better performance. We further +illustrate the effectiveness from the ablation study and examine different +pre-trained models for comparison. Our team, Yao, won the fifth prize +(F1-score: 74.585\%) in the Factify challenge hosted by De-Factify @ AAAI 2022, +which demonstrates that our model achieved competitive performance without +using auxiliary tasks or extra information. The source code of our work is +publicly available at +https://github.com/wywyWang/Multi-Modal-Fact-Verification-2021",Misinformation Verification, +Compose & Embellish: Well-Structured Piano Performance Generation via A Two-Stage Approach,"Even with strong sequence models like Transformers, generating expressive +piano performances with long-range musical structures remains challenging. +Meanwhile, methods to compose well-structured melodies or lead sheets (melody + +chords), i.e., simpler forms of music, gained more success. Observing the +above, we devise a two-stage Transformer-based framework that Composes a lead +sheet first, and then Embellishes it with accompaniment and expressive touches. +Such a factorization also enables pretraining on non-piano data. Our objective +and subjective experiments show that Compose & Embellish shrinks the gap in +structureness between a current state of the art and real performances by half, +and improves other musical aspects such as richness and coherence as well.",Music Generation, +Focus-Driven Contrastive Learniang for Medical Question Summarization,"Automatic medical question summarization can significantly help the system to +understand consumer health questions and retrieve correct answers. 
The Seq2Seq +model based on maximum likelihood estimation (MLE) has been applied in this +task, which faces two general problems: the model can not capture well question +focus and the traditional MLE strategy lacks the ability to understand +sentence-level semantics. To alleviate these problems, we propose a novel +question focus-driven contrastive learning framework (QFCL). Specially, we +propose an easy and effective approach to generate hard negative samples based +on the question focus, and exploit contrastive learning at both encoder and +decoder to obtain better sentence level representations. On three medical +benchmark datasets, our proposed model achieves new state-of-the-art results, +and obtains a performance gain of 5.33, 12.85 and 3.81 points over the baseline +BART model on three datasets respectively. Further human judgement and detailed +analysis prove that our QFCL model learns better sentence representations with +the ability to distinguish different sentence meanings, and generates +high-quality summaries by capturing question focus.",medical question summarization, +Using Natural Sentences for Understanding Biases in Language Models,"Evaluation of biases in language models is often limited to synthetically +generated datasets. This dependence traces back to the need for a prompt-style +dataset to trigger specific behaviors of language models. In this paper, we +address this gap by creating a prompt dataset with respect to occupations +collected from real-world natural sentences present in Wikipedia. We aim to +understand the differences between using template-based prompts and natural +sentence prompts when studying gender-occupation biases in language models.
We +find bias evaluations are very sensitive to the design choices of template +prompts, and we propose using natural sentence prompts for systematic +evaluations to step away from design choices that could introduce bias in the +observations.",Language Models Biases, +Multi Document Reading Comprehension,"Reading Comprehension (RC) is a task of answering a question from a given +passage or a set of passages. In the case of multiple passages, the task is to +find the best possible answer to the question. Recent trials and experiments in +the field of Natural Language Processing (NLP) have proved that machines can be +provided with the ability to not only process the text in the passage and +understand its meaning to answer the question from the passage, but also can +surpass the Human Performance on many datasets such as Stanford's Question +Answering Dataset (SQuAD). This paper presents a study on Reading Comprehension +and its evolution in Natural Language Processing over the past few decades. We +shall also study how the task of Single Document Reading Comprehension acts as +a building block for our Multi-Document Reading Comprehension System. In the +latter half of the paper, we'll be studying about a recently proposed model for +Multi-Document Reading Comprehension - RE3QA that is comprised of a Reader, +Retriever, and a Re-ranker based network to fetch the best possible answer from +a given set of passages.",document reading comprehension, +Vega-MT: The JD Explore Academy Translation System for WMT22,"We describe the JD Explore Academy's submission of the WMT 2022 shared +general translation task. We participated in all high-resource tracks and one +medium-resource track, including Chinese-English, German-English, +Czech-English, Russian-English, and Japanese-English. We push the limit of our +previous work -- bidirectional training for translation by scaling up two main +factors, i.e. language pairs and model sizes, namely the \textbf{Vega-MT} +system.
As for language pairs, we scale the ""bidirectional"" up to the +""multidirectional"" settings, covering all participating languages, to exploit +the common knowledge across languages, and transfer them to the downstream +bilingual tasks. As for model sizes, we scale the Transformer-Big up to the +extremely large model that owns nearly 4.7 Billion parameters, to fully enhance +the model capacity for our Vega-MT. Also, we adopt the data augmentation +strategies, e.g. cycle translation for monolingual data, and bidirectional +self-training for bilingual and monolingual data, to comprehensively exploit +the bilingual and monolingual data. To adapt our Vega-MT to the general domain +test set, generalization tuning is designed. Based on the official automatic +scores of constrained systems, in terms of the sacreBLEU shown in Figure-1, we +got the 1st place on {Zh-En (33.5), En-Zh (49.7), De-En (33.7), En-De (37.8), +Cs-En (54.9), En-Cs (41.4) and En-Ru (32.7)}, 2nd place on {Ru-En (45.1) and +Ja-En (25.6)}, and 3rd place on {En-Ja(41.5)}, respectively; W.R.T the COMET, +we got the 1st place on {Zh-En (45.1), En-Zh (61.7), De-En (58.0), En-De +(63.2), Cs-En (74.7), Ru-En (64.9), En-Ru (69.6) and En-Ja (65.1)}, 2nd place +on {En-Cs (95.3) and Ja-En (40.6)}, respectively.",multidirectional translation, +Generic and Trend-aware Curriculum Learning for Relation Extraction in Graph Neural Networks,"We present a generic and trend-aware curriculum learning approach for graph +neural networks. It extends existing approaches by incorporating sample-level +loss trends to better discriminate easier from harder samples and schedule them +for training. The model effectively integrates textual and structural +information for relation extraction in text graphs. 
Experimental results show +that the model provides robust estimations of sample difficulty and shows +sizable improvement over the state-of-the-art approaches across several +datasets.",relation extraction, +Too Brittle To Touch: Comparing the Stability of Quantization and Distillation Towards Developing Lightweight Low-Resource MT Models,"Leveraging shared learning through Massively Multilingual Models, +state-of-the-art machine translation models are often able to adapt to the +paucity of data for low-resource languages. However, this performance comes at +the cost of significantly bloated models which are not practically deployable. +Knowledge Distillation is one popular technique to develop competitive, +lightweight models: In this work, we first evaluate its use to compress MT +models focusing on languages with extremely limited training data. Through our +analysis across 8 languages, we find that the variance in the performance of +the distilled models due to their dependence on priors including the amount of +synthetic data used for distillation, the student architecture, training +hyperparameters and confidence of the teacher models, makes distillation a +brittle compression mechanism. To mitigate this, we explore the use of +post-training quantization for the compression of these models. Here, we find +that while distillation provides gains across some low-resource languages, +quantization provides more consistent performance trends for the entire range +of languages, especially the lowest-resource languages in our target set.",low-resource machine translation, +Is explainable AI a race against model complexity?,"Explaining the behaviour of intelligent systems will get increasingly and +perhaps intractably challenging as models grow in size and complexity. We may +not be able to expect an explanation for every prediction made by a brain-scale +model, nor can we expect explanations to remain objective or apolitical. 
Our +functionalist understanding of these models is of less advantage than we might +assume. Models precede explanations, and can be useful even when both model and +explanation are incorrect. Explainability may never win the race against +complexity, but this is less problematic than it seems.",explainable AI, +Learning Compositional Representations for Effective Low-Shot Generalization,"We propose Recognition as Part Composition (RPC), an image encoding approach +inspired by human cognition. It is based on the cognitive theory that humans +recognize complex objects by components, and that they build a small compact +vocabulary of concepts to represent each instance with. RPC encodes images by +first decomposing them into salient parts, and then encoding each part as a +mixture of a small number of prototypes, each representing a certain concept. +We find that this type of learning inspired by human cognition can overcome +hurdles faced by deep convolutional networks in low-shot generalization tasks, +like zero-shot learning, few-shot learning and unsupervised domain adaptation. +Furthermore, we find a classifier using an RPC image encoder is fairly robust +to adversarial attacks, that deep neural networks are known to be prone to. +Given that our image encoding principle is based on human cognition, one would +expect the encodings to be interpretable by humans, which we find to be the +case via crowd-sourcing experiments. Finally, we propose an application of +these interpretable encodings in the form of generating synthetic attribute +annotations for evaluating zero-shot learning methods on new datasets.",Low-Shot Generalization, +SNeRF: Stylized Neural Implicit Representations for 3D Scenes,"This paper presents a stylized novel view synthesis method. Applying +state-of-the-art stylization methods to novel views frame by frame often causes +jittering artifacts due to the lack of cross-view consistency. 
Therefore, this +paper investigates 3D scene stylization that provides a strong inductive bias +for consistent novel view synthesis. Specifically, we adopt the emerging neural +radiance fields (NeRF) as our choice of 3D scene representation for their +capability to render high-quality novel views for a variety of scenes. However, +as rendering a novel view from a NeRF requires a large number of samples, +training a stylized NeRF requires a large amount of GPU memory that goes beyond +an off-the-shelf GPU capacity. We introduce a new training method to address +this problem by alternating the NeRF and stylization optimization steps. Such a +method enables us to make full use of our hardware memory capacity to both +generate images at higher resolution and adopt more expressive image style +transfer methods. Our experiments show that our method produces stylized NeRFs +for a wide range of content, including indoor, outdoor and dynamic scenes, and +synthesizes high-quality novel views with cross-view consistency.",3D scene stylization, +Atlas: Few-shot Learning with Retrieval Augmented Language Models,"Large language models have shown impressive few-shot results on a wide range +of tasks. However, when knowledge is key for such results, as is the case for +tasks such as question answering and fact checking, massive parameter counts to +store knowledge seem to be needed. Retrieval augmented models are known to +excel at knowledge intensive tasks without the need for as many parameters, but +it is unclear whether they work in few-shot settings. In this work we present +Atlas, a carefully designed and pre-trained retrieval augmented language model +able to learn knowledge intensive tasks with very few training examples. We +perform evaluations on a wide range of tasks, including MMLU, KILT and +NaturalQuestions, and study the impact of the content of the document index, +showing that it can easily be updated. 
Notably, Atlas reaches over 42% accuracy +on Natural Questions using only 64 examples, outperforming a 540B parameters +model by 3% despite having 50x fewer parameters.",Retrieval-Augmented Generation, +Bridging Cross-Lingual Gaps During Leveraging the Multilingual Sequence-to-Sequence Pretraining for Text Generation and Understanding,"For multilingual sequence-to-sequence pretrained language models +(multilingual Seq2Seq PLMs), e.g. mBART, the self-supervised pretraining task +is trained on a wide range of monolingual languages, e.g. 25 languages from +CommonCrawl, while the downstream cross-lingual tasks generally progress on a +bilingual language subset, e.g. English-German, making there exists the data +discrepancy, namely domain discrepancy, and cross-lingual learning objective +discrepancy, namely task discrepancy, between the pretraining and finetuning +stages. To bridge the above cross-lingual domain and task gaps, we extend the +vanilla pretrain-finetune pipeline with extra code-switching restore task. +Specifically, the first stage employs the self-supervised code-switching +restore task as a pretext task, allowing the multilingual Seq2Seq PLMs to +acquire some in-domain alignment information. And for the second stage, we +fine-tune the model on downstream data normally. Experiments on both NLG +evaluation (12 bilingual translation tasks, 30 zero-shot translation tasks, and +2 cross-lingual summarization tasks) and NLU evaluation (7 cross-lingual +natural language inference tasks) show our model outperforms the strong +baseline mBART with standard finetuning strategy, consistently. Analyses +indicate our approach could narrow the Euclidean distance of cross-lingual +sentence representations, and improve the model generalization with trivial +computational cost. 
We release the code at: +https://github.com/zanchangtong/CSR4mBART.",cross-lingual text generation, +How Large Language Models are Transforming Machine-Paraphrased Plagiarism,"The recent success of large language models for text generation poses a +severe threat to academic integrity, as plagiarists can generate realistic +paraphrases indistinguishable from original work. However, the role of large +autoregressive transformers in generating machine-paraphrased plagiarism and +their detection is still developing in the literature. This work explores T5 +and GPT-3 for machine-paraphrase generation on scientific articles from arXiv, +student theses, and Wikipedia. We evaluate the detection performance of six +automated solutions and one commercial plagiarism detection software and +perform a human study with 105 participants regarding their detection +performance and the quality of generated examples. Our results suggest that +large models can rewrite text humans have difficulty identifying as +machine-paraphrased (53% mean acc.). Human experts rate the quality of +paraphrases generated by GPT-3 as high as original texts (clarity 4.0/5, +fluency 4.2/5, coherence 3.8/5). The best-performing detection model (GPT-3) +achieves a 66% F1-score in detecting paraphrases.",machine-paraphrased plagiarism detection, +UniSAr: A Unified Structure-Aware Autoregressive Language Model for Text-to-SQL,"Existing text-to-SQL semantic parsers are typically designed for particular +settings such as handling queries that span multiple tables, domains or turns +which makes them ineffective when applied to different settings. We present +UniSAr (Unified Structure-Aware Autoregressive Language Model), which benefits +from directly using an off-the-shelf language model architecture and +demonstrates consistently high performance under different settings. 
+Specifically, UniSAr extends existing autoregressive language models to +incorporate three non-invasive extensions to make them structure-aware: (1) +adding structure mark to encode database schema, conversation context, and +their relationships; (2) constrained decoding to decode well structured SQL for +a given database schema; and (3) SQL completion to complete potential missing +JOIN relationships in SQL based on database schema. On seven well-known +text-to-SQL datasets covering multi-domain, multi-table and multi-turn, UniSAr +demonstrates highly comparable or better performance to the most advanced +specifically-designed text-to-SQL models. Importantly, our UniSAr is +non-invasive, such that other core model advances in text-to-SQL can also adopt +our extensions to further enhance performance.",text-to-SQL, +Contextualized language models for semantic change detection: lessons learned,"We present a qualitative analysis of the (potentially erroneous) outputs of +contextualized embedding-based methods for detecting diachronic semantic +change. First, we introduce an ensemble method outperforming previously +described contextualized approaches. This method is used as a basis for an +in-depth analysis of the degrees of semantic change predicted for English words +across 5 decades. Our findings show that contextualized methods can often +predict high change scores for words which are not undergoing any real +diachronic semantic shift in the lexicographic sense of the term (or at least +the status of these shifts is questionable). Such challenging cases are +discussed in detail with examples, and their linguistic categorization is +proposed. Our conclusion is that pre-trained contextualized language models are +prone to confound changes in lexicographic senses and changes in contextual +variance, which naturally stem from their distributional nature, but is +different from the types of issues observed in methods based on static +embeddings. 
Additionally, they often merge together syntactic and semantic +aspects of lexical entities. We propose a range of possible future solutions to +these issues.",semantic change detection, +Enhanced Knowledge Selection for Grounded Dialogues via Document Semantic Graphs,"Providing conversation models with background knowledge has been shown to +make open-domain dialogues more informative and engaging. Existing models treat +knowledge selection as a sentence ranking or classification problem where each +sentence is handled individually, ignoring the internal semantic connection +among sentences in the background document. In this work, we propose to +automatically convert the background knowledge documents into document semantic +graphs and then perform knowledge selection over such graphs. Our document +semantic graphs preserve sentence-level information through the use of sentence +nodes and provide concept connections between sentences. We jointly apply +multi-task learning for sentence-level and concept-level knowledge selection +and show that it improves sentence-level selection. Our experiments show that +our semantic graph-based knowledge selection improves over sentence selection +baselines for both the knowledge selection task and the end-to-end response +generation task on HollE and improves generalization on unseen topics in WoW.",Knowledge Grounded Dialogue, +ML4CO-KIDA: Knowledge Inheritance in Dataset Aggregation,"The Machine Learning for Combinatorial Optimization (ML4CO) NeurIPS 2021 +competition aims to improve state-of-the-art combinatorial optimization solvers +by replacing key heuristic components with machine learning models. On the dual +task, we design models to make branching decisions to promote the dual bound +increase faster. We propose a knowledge inheritance method to generalize +knowledge of different models from the dataset aggregation process, named KIDA. 
+Our improvement overcomes some defects of the baseline +graph-neural-networks-based methods. Further, we won the +$1$\textsuperscript{st} Place on the dual task. We hope this report can provide +useful experience for developers and researchers. The code is available at +https://github.com/megvii-research/NeurIPS2021-ML4CO-KIDA.",Combinatorial Optimization, +AdaEnlight: Energy-aware Low-light Video Stream Enhancement on Mobile Devices,"The ubiquity of camera-embedded devices and the advances in deep learning +have stimulated various intelligent mobile video applications. These +applications often demand on-device processing of video streams to deliver +real-time, high-quality services for privacy and robustness concerns. However, +the performance of these applications is constrained by the raw video streams, +which tend to be taken with small-aperture cameras of ubiquitous mobile +platforms in dim light. Despite extensive low-light video enhancement +solutions, they are unfit for deployment to mobile devices due to their complex +models and ignorance of system dynamics like energy budgets. In this paper, +we propose AdaEnlight, an energy-aware low-light video stream enhancement +system on mobile devices. It achieves real-time video enhancement with +competitive visual quality while allowing runtime behavior adaptation to the +platform-imposed dynamic energy budgets. We report extensive experiments on +diverse datasets, scenarios, and platforms and demonstrate the superiority of +AdaEnlight compared with state-of-the-art low-light image and video enhancement +solutions.",low-light video enhancement, +Unsupervised Extractive Summarization with Heterogeneous Graph Embeddings for Chinese Document,"In the scenario of unsupervised extractive summarization, learning +high-quality sentence representations is essential to select salient sentences +from the input document.
Previous studies focus more on employing statistical +approaches or pre-trained language models (PLMs) to extract sentence +embeddings, while ignoring the rich information inherent in the heterogeneous +types of interaction between words and sentences. In this paper, we are the +first to propose an unsupervised extractive summarization method with +heterogeneous graph embeddings (HGEs) for Chinese document. A heterogeneous +text graph is constructed to capture different granularities of interactions by +incorporating graph structural information. Moreover, our proposed graph is +general and flexible where additional nodes such as keywords can be easily +integrated. Experimental results demonstrate that our method consistently +outperforms the strong baseline in three summarization datasets.",Extractive Summarization, +Unsupervised Severely Deformed Mesh Reconstruction (DMR) from a Single-View Image,"Much progress has been made in the supervised learning of 3D reconstruction +of rigid objects from multi-view images or a video. However, it is more +challenging to reconstruct severely deformed objects from a single-view RGB +image in an unsupervised manner. Although training-based methods, such as +specific category-level training, have been shown to successfully reconstruct +rigid objects and slightly deformed objects like birds from a single-view +image, they cannot effectively handle severely deformed objects and neither can +be applied to some downstream tasks in the real world due to the inconsistent +semantic meaning of vertices, which are crucial in defining the adopted 3D +templates of objects to be reconstructed. In this work, we introduce a +template-based method to infer 3D shapes from a single-view image and apply the +reconstructed mesh to a downstream task, i.e., absolute length measurement.
+Without using 3D ground truth, our method faithfully reconstructs 3D meshes and +achieves state-of-the-art accuracy in a length measurement task on a severely +deformed fish dataset.",3D reconstruction, +Memory-based Message Passing: Decoupling the Message for Propogation from Discrimination,"Message passing is a fundamental procedure for graph neural networks in the +field of graph representation learning. Based on the homophily assumption, the +current message passing always aggregates features of connected nodes, such as +the graph Laplacian smoothing process. However, real-world graphs tend to be +noisy and/or non-smooth. The homophily assumption does not always hold, leading +to sub-optimal results. A revised message passing method needs to maintain each +node's discriminative ability when aggregating the message from neighbors. To +this end, we propose a Memory-based Message Passing (MMP) method to decouple +the message of each node into a self-embedding part for discrimination and a +memory part for propagation. Furthermore, we develop a control mechanism and a +decoupling regularization to control the ratio of absorbing and excluding the +message in the memory for each node. More importantly, our MMP is a general +skill that can work as an additional layer to help improve traditional GNNs +performance. Extensive experiments on various datasets with different homophily +ratios demonstrate the effectiveness and robustness of the proposed method.",graph representation learning, +On the vulnerability of fingerprint verification systems to fake fingerprint attacks,"A new method to generate gummy fingers is presented. A medium-size fake +fingerprint database is described and two different fingerprint verification +systems are evaluated on it. 
Three different scenarios are considered in the +experiments, namely: enrollment and test with real fingerprints, enrollment and +test with fake fingerprints, and enrollment with real fingerprints and test +with fake fingerprints. Results for an optical and a thermal sweeping sensors +are given. Both systems are shown to be vulnerable to direct attacks.",fake fingerprint attacks, +"Benchmarking zero-shot and few-shot approaches for tokenization, tagging, and dependency parsing of Tagalog text","The grammatical analysis of texts in any written language typically involves +a number of basic processing tasks, such as tokenization, morphological +tagging, and dependency parsing. State-of-the-art systems can achieve high +accuracy on these tasks for languages with large datasets, but yield poor +results for languages which have little to no annotated data. To address this +issue for the Tagalog language, we investigate the use of alternative language +resources for creating task-specific models in the absence of +dependency-annotated Tagalog data. We also explore the use of word embeddings +and data augmentation to improve performance when only a small amount of +annotated Tagalog data is available. We show that these zero-shot and few-shot +approaches yield substantial improvements on grammatical analysis of both +in-domain and out-of-domain Tagalog text compared to state-of-the-art +supervised baselines.",Tagalog text processing, +Fairness in generative modeling,"We design general-purpose algorithms for addressing fairness issues and mode +collapse in generative modeling. More precisely, to design fair algorithms for +as many sensitive variables as possible, including variables we might not be +aware of, we assume no prior knowledge of sensitive variables: our algorithms +use unsupervised fairness only, meaning no information related to the sensitive +variables is used for our fairness-improving methods. 
All images of faces (even +generated ones) have been removed to mitigate legal risks.",generative modeling, +A Variational Approach for Joint Image Recovery and Feature Extraction Based on Spatially-Varying Generalised Gaussian Models,"The joint problem of reconstruction / feature extraction is a challenging +task in image processing. It consists in performing, in a joint manner, the +restoration of an image and the extraction of its features. In this work, we +firstly propose a novel nonsmooth and non-convex variational formulation of the +problem. For this purpose, we introduce a versatile generalised Gaussian prior +whose parameters, including its exponent, are space-variant. Secondly, we +design an alternating proximal-based optimisation algorithm that efficiently +exploits the structure of the proposed non-convex objective function. We also +analyse the convergence of this algorithm. As shown in numerical experiments +conducted on joint deblurring/segmentation tasks, the proposed method provides +high-quality results.",image recovery, +Goal-directed Planning and Goal Understanding by Active Inference: Evaluation Through Simulated and Physical Robot Experiments,"We show that goal-directed action planning and generation in a teleological +framework can be formulated using the free energy principle. The proposed +model, which is built on a variational recurrent neural network model, is +characterized by three essential features. These are that (1) goals can be +specified for both static sensory states, e.g., for goal images to be reached +and dynamic processes, e.g., for moving around an object, (2) the model can not +only generate goal-directed action plans, but can also understand goals by +sensory observation, and (3) the model generates future action plans for given +goals based on the best estimate of the current state, inferred using past +sensory observations. 
The proposed model is evaluated by conducting experiments +on a simulated mobile agent as well as on a real humanoid robot performing +object manipulation.",action planning, +BOS at LSCDiscovery: Lexical Substitution for Interpretable Lexical Semantic Change Detection,"We propose a solution for the LSCDiscovery shared task on Lexical Semantic +Change Detection in Spanish. Our approach is based on generating lexical +substitutes that describe old and new senses of a given word. This approach +achieves the second best result in sense loss and sense gain detection +subtasks. By observing those substitutes that are specific for only one time +period, one can understand which senses were obtained or lost. This allows +providing more detailed information about semantic change to the user and makes +our method interpretable.",lexical semantic change detection, +Learning to Find Proofs and Theorems by Learning to Refine Search Strategies: The Case of Loop Invariant Synthesis,"We propose a new approach to automated theorem proving where an +AlphaZero-style agent is self-training to refine a generic high-level expert +strategy expressed as a nondeterministic program. An analogous teacher agent is +self-training to generate tasks of suitable relevance and difficulty for the +learner. This allows leveraging minimal amounts of domain knowledge to tackle +problems for which training data is unavailable or hard to synthesize. As a +specific illustration, we consider loop invariant synthesis for imperative +programs and use neural networks to refine both the teacher and solver +strategies.",Automated Theorem Proving, +Post-Train Adaptive MobileNet for Fast Anti-Spoofing,"Many applications require high accuracy of neural networks as well as low +latency and user data privacy guaranty. Face anti-spoofing is one of such +tasks. However, a single model might not give the best results for different +device performance categories, while training multiple models is time +consuming. 
In this work we present Post-Train Adaptive (PTA) block. Such a +block is simple in structure and offers a drop-in replacement for the +MobileNetV2 Inverted Residual block. The PTA block has multiple branches with +different computation costs. The branch to execute can be selected on-demand +and at runtime; thus, offering different inference times and configuration +capability for multiple device tiers. Crucially, the model is trained once and +can be easily reconfigured after training, even directly on a mobile device. In +addition, the proposed approach shows substantially better overall performance +in comparison to the original MobileNetV2 as tested on CelebA-Spoof dataset. +Different PTA block configurations are sampled at training time, which also +decreases overall wall-clock time needed to train the model. While we present +computational results for the anti-spoofing problem, the MobileNetV2 with PTA +blocks is applicable to any problem solvable with convolutional neural +networks, which makes the results presented practically significant.",Face anti-spoofing, +Backtracking Counterfactuals,"Counterfactual reasoning -- envisioning hypothetical scenarios, or possible +worlds, where some circumstances are different from what (f)actually occurred +(counter-to-fact) -- is ubiquitous in human cognition. Conventionally, +counterfactually-altered circumstances have been treated as ""small miracles"" +that locally violate the laws of nature while sharing the same initial +conditions. In Pearl's structural causal model (SCM) framework this is made +mathematically rigorous via interventions that modify the causal laws while the +values of exogenous variables are shared. In recent years, however, this purely +interventionist account of counterfactuals has increasingly come under scrutiny +from both philosophers and psychologists. 
Instead, they suggest a backtracking +account of counterfactuals, according to which the causal laws remain unchanged +in the counterfactual world; differences to the factual world are instead +""backtracked"" to altered initial conditions (exogenous variables). In the +present work, we explore and formalise this alternative mode of counterfactual +reasoning within the SCM framework. Despite ample evidence that humans +backtrack, the present work constitutes, to the best of our knowledge, the +first general account and algorithmisation of backtracking counterfactuals. We +discuss our backtracking semantics in the context of related literature and +draw connections to recent developments in explainable artificial intelligence +(XAI).",Counterfactual reasoning, +DocAsRef: An Empirical Study on Repurposing Reference-Based Summary Quality Metrics Reference-Freely,"Automated summary quality assessment falls into two categories: +reference-based and reference-free. Reference-based metrics, historically +deemed more accurate due to the additional information provided by +human-written references, are limited by their reliance on human input. In this +paper, we hypothesize that the comparison methodologies used by some +reference-based metrics to evaluate a system summary against its corresponding +reference can be effectively adapted to assess it against its source document, +thereby transforming these metrics into reference-free ones. Experimental +results support this hypothesis. After being repurposed reference-freely, the +zero-shot BERTScore using the pretrained DeBERTa-large-MNLI model of <0.5B +parameters consistently outperforms its original reference-based version across +various aspects on the SummEval and Newsroom datasets. 
It also excels in +comparison to most existing reference-free metrics and closely competes with +zero-shot summary evaluators based on GPT-3.5.",summary quality assessment, +NeROIC: Neural Rendering of Objects from Online Image Collections,"We present a novel method to acquire object representations from online image +collections, capturing high-quality geometry and material properties of +arbitrary objects from photographs with varying cameras, illumination, and +backgrounds. This enables various object-centric rendering applications such as +novel-view synthesis, relighting, and harmonized background composition from +challenging in-the-wild input. Using a multi-stage approach extending neural +radiance fields, we first infer the surface geometry and refine the coarsely +estimated initial camera parameters, while leveraging coarse foreground object +masks to improve the training efficiency and geometry quality. We also +introduce a robust normal estimation technique which eliminates the effect of +geometric noise while retaining crucial details. Lastly, we extract surface +material properties and ambient illumination, represented in spherical +harmonics with extensions that handle transient elements, e.g. sharp shadows. +The union of these components results in a highly modular and efficient object +acquisition framework. Extensive evaluations and comparisons demonstrate the +advantages of our approach in capturing high-quality geometry and appearance +properties useful for rendering applications.",neural rendering, +Intra and Cross-spectrum Iris Presentation Attack Detection in the NIR and Visible Domains,"Iris Presentation Attack Detection (PAD) is essential to secure iris +recognition systems. Recent iris PAD solutions achieved good performance by +leveraging deep learning techniques. However, most results were reported under +intra-database scenarios and it is unclear if such solutions can generalize +well across databases and capture spectra. 
These PAD methods run the risk of +overfitting because of the binary label supervision during the network +training, which serves global information learning but weakens the capture of +local discriminative features. This chapter presents a novel attention-based +deep pixel-wise binary supervision (A-PBS) method. A-PBS utilizes pixel-wise +supervision to capture the fine-grained pixel/patch-level cues and attention +mechanism to guide the network to automatically find regions where most +contribute to an accurate PAD decision. Extensive experiments are performed on +six NIR and one visible-light iris databases to show the effectiveness and +robustness of proposed A-PBS methods. We additionally conduct extensive +experiments under intra-/cross-database and intra-/cross-spectrum for detailed +analysis. The results of our experiments indicates the generalizability of the +A-PBS iris PAD approach.",iris presentation attack detection, +Attention Mechanism Meets with Hybrid Dense Network for Hyperspectral Image Classification,"Convolutional Neural Networks (CNN) are more suitable, indeed. However, fixed +kernel sizes make traditional CNN too specific, neither flexible nor conducive +to feature learning, thus impacting on the classification accuracy. The +convolution of different kernel size networks may overcome this problem by +capturing more discriminating and relevant information. In light of this, the +proposed solution aims at combining the core idea of 3D and 2D Inception net +with the Attention mechanism to boost the HSIC CNN performance in a hybrid +scenario. The resulting \textit{attention-fused hybrid network} (AfNet) is +based on three attention-fused parallel hybrid sub-nets with different kernels +in each block repeatedly using high-level features to enhance the final +ground-truth maps. In short, AfNet is able to selectively filter out the +discriminative features critical for classification. 
Several tests on HSI +datasets provided competitive results for AfNet compared to state-of-the-art +models. The proposed pipeline achieved, indeed, an overall accuracy of 97\% for +the Indian Pines, 100\% for Botswana, 99\% for Pavia University, Pavia Center, +and Salinas datasets.",hyperspectral image classification, +A Framework for CSI-Based Indoor Localization with 1D Convolutional Neural Networks,"Modern indoor localization techniques are essential to overcome the weak GPS +coverage in indoor environments. Recently, considerable progress has been made +in Channel State Information (CSI) based indoor localization with signal +fingerprints. However, CSI signal patterns can be complicated in the large and +highly dynamic indoor spaces with complex interiors, thus a solution for +solving this issue is urgently needed to expand the applications of CSI to a +broader indoor space. In this paper, we propose an end-to-end solution +including data collection, pattern clustering, denoising, calibration and a +lightweight one-dimensional convolutional neural network (1D CNN) model with +CSI fingerprinting to tackle this problem. We have also created and plan to +open source a CSI dataset with a large amount of data collected across complex +indoor environments at Colorado State University. Experiments indicate that our +approach achieves up to 68.5% improved performance (mean distance error) with +minimal number of parameters, compared to the best-known deep machine learning +and CSI-based indoor localization works.",CSI-based indoor localization, +A Feasibility Study of Answer-Agnostic Question Generation for Education,"We conduct a feasibility study into the applicability of answer-agnostic +question generation models to textbook passages. We show that a significant +portion of errors in such systems arise from asking irrelevant or +uninterpretable questions and that such errors can be ameliorated by providing +summarized input. 
We find that giving these models human-written summaries +instead of the original text results in a significant increase in acceptability +of generated questions (33% $\rightarrow$ 83%) as determined by expert +annotators. We also find that, in the absence of human-written summaries, +automatic summarization can serve as a good middle ground.",answer-agnostic question generation, +A Unified Granular-ball Learning Model of Pawlak Rough Set and Neighborhood Rough Set,"Pawlak rough set and neighborhood rough set are the two most common rough set +theoretical models. Pawlak can use equivalence classes to represent knowledge, +but it cannot process continuous data; neighborhood rough sets can process +continuous data, but it loses the ability of using equivalence classes to +represent knowledge. To this end, this paper presents a granular-ball rough set +based on the granular-ball computing. The granular-ball rough set can +simultaneously represent Pawlak rough sets, and the neighborhood rough set, so +as to realize the unified representation of the two. This makes the +granular-ball rough set not only can deal with continuous data, but also can +use equivalence classes for knowledge representation. In addition, we propose +an implementation algorithms of granular-ball rough sets. The experimental +results on benchmark datasets demonstrate that, due to the combination of the +robustness and adaptability of the granular-ball computing, the learning +accuracy of the granular-ball rough set has been greatly improved compared with +the Pawlak rough set and the traditional neighborhood rough set. 
The +granular-ball rough set also outperforms nine popular or the state-of-the-art +feature selection methods.",granular-ball learning, +Image Reconstruction of Multi Branch Feature Multiplexing Fusion Network with Mixed Multi-layer Attention,"Image super-resolution reconstruction achieves better results than +traditional methods with the help of the powerful nonlinear representation +ability of convolution neural network. However, some existing algorithms also +have some problems, such as insufficient utilization of phased features, +ignoring the importance of early phased feature fusion to improve network +performance, and the inability of the network to pay more attention to +high-frequency information in the reconstruction process. To solve these +problems, we propose a multi-branch feature multiplexing fusion network with +mixed multi-layer attention (MBMFN), which realizes the multiple utilization of +features and the multistage fusion of different levels of features. To further +improve the networks performance, we propose a lightweight enhanced residual +channel attention (LERCA), which can not only effectively avoid the loss of +channel information but also make the network pay more attention to the key +channel information and benefit from it. Finally, the attention mechanism is +introduced into the reconstruction process to strengthen the restoration of +edge texture and other details. A large number of experiments on several +benchmark sets show that, compared with other advanced reconstruction +algorithms, our algorithm produces highly competitive objective indicators and +restores more image detail texture information.",image super-resolution reconstruction, +SPACE-3: Unified Dialog Model Pre-training for Task-Oriented Dialog Understanding and Generation,"Recently, pre-training methods have shown remarkable success in task-oriented +dialog (TOD) systems. 
However, most existing pre-trained models for TOD focus +on either dialog understanding or dialog generation, but not both. In this +paper, we propose SPACE-3, a novel unified semi-supervised pre-trained +conversation model learning from large-scale dialog corpora with limited +annotations, which can be effectively fine-tuned on a wide range of downstream +dialog tasks. Specifically, SPACE-3 consists of four successive components in a +single transformer to maintain a task-flow in TOD systems: (i) a dialog +encoding module to encode dialog history, (ii) a dialog understanding module to +extract semantic vectors from either user queries or system responses, (iii) a +dialog policy module to generate a policy vector that contains high-level +semantics of the response, and (iv) a dialog generation module to produce +appropriate responses. We design a dedicated pre-training objective for each +component. Concretely, we pre-train the dialog encoding module with span mask +language modeling to learn contextualized dialog information. To capture the +structured dialog semantics, we pre-train the dialog understanding module via a +novel tree-induced semi-supervised contrastive learning objective with the help +of extra dialog annotations. In addition, we pre-train the dialog policy module +by minimizing the L2 distance between its output policy vector and the semantic +vector of the response for policy optimization. Finally, the dialog generation +model is pre-trained by language modeling. Results show that SPACE-3 achieves +state-of-the-art performance on eight downstream dialog benchmarks, including +intent prediction, dialog state tracking, and end-to-end dialog modeling. 
We +also show that SPACE-3 has a stronger few-shot ability than existing models +under the low-resource setting.",task-oriented dialog, +The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task,"This paper describes the submission of our end-to-end YiTrans speech +translation system for the IWSLT 2022 offline task, which translates from +English audio to German, Chinese, and Japanese. The YiTrans system is built on +large-scale pre-trained encoder-decoder models. More specifically, we first +design a multi-stage pre-training strategy to build a multi-modality model with +a large amount of labeled and unlabeled data. We then fine-tune the +corresponding components of the model for the downstream speech translation +tasks. Moreover, we make various efforts to improve performance, such as data +filtering, data augmentation, speech segmentation, model ensemble, and so on. +Experimental results show that our YiTrans system obtains a significant +improvement than the strong baseline on three translation directions, and it +achieves +5.2 BLEU improvements over last year's optimal end-to-end system on +tst2021 English-German. Our final submissions rank first on English-German and +English-Chinese end-to-end systems in terms of the automatic evaluation metric. +We make our code and models publicly available.",speech translation, +Wasserstein Graph Distance Based on $L_1$-Approximated Tree Edit Distance between Weisfeiler-Lehman Subtrees,"The Weisfeiler-Lehman (WL) test is a widely used algorithm in graph machine +learning, including graph kernels, graph metrics, and graph neural networks. +However, it focuses only on the consistency of the graph, which means that it +is unable to detect slight structural differences. Consequently, this limits +its ability to capture structural information, which also limits the +performance of existing models that rely on the WL test. 
This limitation is +particularly severe for traditional metrics defined by the WL test, which +cannot precisely capture slight structural differences. In this paper, we +propose a novel graph metric called the Wasserstein WL Subtree (WWLS) distance +to address this problem. Our approach leverages the WL subtree as structural +information for node neighborhoods and defines node metrics using the +$L_1$-approximated tree edit distance ($L_1$-TED) between WL subtrees of nodes. +Subsequently, we combine the Wasserstein distance and the $L_1$-TED to define +the WWLS distance, which can capture slight structural differences that may be +difficult to detect using conventional metrics. We demonstrate that the +proposed WWLS distance outperforms baselines in both metric validation and +graph classification experiments.",Graph Metric, +Automatic Rule Induction for Interpretable Semi-Supervised Learning,"Semi-supervised learning has shown promise in allowing NLP models to +generalize from small amounts of labeled data. Meanwhile, pretrained +transformer models act as black-box correlation engines that are difficult to +explain and sometimes behave unreliably. In this paper, we propose tackling +both of these challenges via Automatic Rule Induction (ARI), a simple and +general-purpose framework for the automatic discovery and integration of +symbolic rules into pretrained transformer models. First, we extract weak +symbolic rules from low-capacity machine learning models trained on small +amounts of labeled data. Next, we use an attention mechanism to integrate these +rules into high-capacity pretrained transformer models. Last, the +rule-augmented system becomes part of a self-training framework to boost +supervision signal on unlabeled data. These steps can be layered beneath a +variety of existing weak supervision and semi-supervised NLP algorithms in +order to improve performance and interpretability. 
Experiments across nine +sequence classification and relation extraction tasks suggest that ARI can +improve state-of-the-art methods with no manual effort and minimal +computational overhead.",Semi-supervised learning, +Measuring the Impact of Individual Domain Factors in Self-Supervised Pre-Training,"Human speech data comprises a rich set of domain factors such as accent, +syntactic and semantic variety, or acoustic environment. Previous work explores +the effect of domain mismatch in automatic speech recognition between +pre-training and fine-tuning as a whole but does not dissect the contribution +of individual factors. In this paper, we present a controlled study to better +understand the effect of such factors on the performance of pre-trained +representations on automatic speech recognition. To do so, we pre-train models +either on modified natural speech or synthesized audio, with a single domain +factor modified, and then measure performance after fine-tuning. Results show +that phonetic domain factors play an important role during pre-training while +grammatical and syntactic factors are far less important. To our knowledge, +this is the first study to better understand the domain characteristics of +pre-trained sets in self-supervised pre-training for speech.",self-supervised pre-training, +Label uncertainty-guided multi-stream model for disease screening,"The annotation of disease severity for medical image datasets often relies on +collaborative decisions from multiple human graders. The intra-observer +variability derived from individual differences always persists in this +process, yet the influence is often underestimated. In this paper, we cast the +intra-observer variability as an uncertainty problem and incorporate the label +uncertainty information as guidance into the disease screening model to improve +the final decision. 
The main idea is dividing the images into simple and hard +cases by uncertainty information, and then developing a multi-stream network to +deal with different cases separately. Particularly, for hard cases, we +strengthen the network's capacity in capturing the correct disease features and +resisting the interference of uncertainty. Experiments on a fundus image-based +glaucoma screening case study show that the proposed model outperforms several +baselines, especially in screening hard cases.",disease screening, +NPBG++: Accelerating Neural Point-Based Graphics,"We present a new system (NPBG++) for the novel view synthesis (NVS) task that +achieves high rendering realism with low scene fitting time. Our method +efficiently leverages the multiview observations and the point cloud of a +static scene to predict a neural descriptor for each point, improving upon the +pipeline of Neural Point-Based Graphics in several important ways. By +predicting the descriptors with a single pass through the source images, we +lift the requirement of per-scene optimization while also making the neural +descriptors view-dependent and more suitable for scenes with strong +non-Lambertian effects. In our comparisons, the proposed system outperforms +previous NVS approaches in terms of fitting and rendering runtimes while +producing images of similar quality.",novel view synthesis, +WikiLink: an encyclopedia-based semantic network for design innovation,"Data-driven design and innovation is a process to reuse and provide valuable +and useful information. However, existing semantic networks for design +innovation is built on data source restricted to technological and scientific +information. Besides, existing studies build the edges of a semantic network +only on either statistical or semantic relationships, which is less likely to +make full use of the benefits from both types of relationships and discover +implicit knowledge for design innovation. 
Therefore, we constructed WikiLink, a +semantic network based on Wikipedia. Combined weight which fuses both the +statistic and semantic weights between concepts is introduced in WikiLink, and +four algorithms are developed for inspiring new ideas. Evaluation experiments +are undertaken and results show that the network is characterised by high +coverage of terms, relationships and disciplines, which proves the network's +effectiveness and usefulness. Then a demonstration and case study results +indicate that WikiLink can serve as an idea generation tool for innovation in +conceptual design. The source code of WikiLink and the backend data are +provided open-source for more users to explore and build on.",design innovation, +Cross-Architecture Knowledge Distillation,"Transformer attracts much attention because of its ability to learn global +relations and superior performance. In order to achieve higher performance, it +is natural to distill complementary knowledge from Transformer to convolutional +neural network (CNN). However, most existing knowledge distillation methods +only consider homologous-architecture distillation, such as distilling +knowledge from CNN to CNN. They may not be suitable when applying to +cross-architecture scenarios, such as from Transformer to CNN. To deal with +this problem, a novel cross-architecture knowledge distillation method is +proposed. Specifically, instead of directly mimicking output/intermediate +features of the teacher, partially cross attention projector and group-wise +linear projector are introduced to align the student features with the +teacher's in two projected feature spaces. And a multi-view robust training +scheme is further presented to improve the robustness and stability of the +framework. 
Extensive experiments show that the proposed method outperforms 14 +state-of-the-arts on both small-scale and large-scale datasets.",knowledge distillation, +WIDER & CLOSER: Mixture of Short-channel Distillers for Zero-shot Cross-lingual Named Entity Recognition,"Zero-shot cross-lingual named entity recognition (NER) aims at transferring +knowledge from annotated and rich-resource data in source languages to +unlabeled and lean-resource data in target languages. Existing mainstream +methods based on the teacher-student distillation framework ignore the rich and +complementary information lying in the intermediate layers of pre-trained +language models, and domain-invariant information is easily lost during +transfer. In this study, a mixture of short-channel distillers (MSD) method is +proposed to fully interact the rich hierarchical information in the teacher +model and to transfer knowledge to the student model sufficiently and +efficiently. Concretely, a multi-channel distillation framework is designed for +sufficient information transfer by aggregating multiple distillers as a +mixture. Besides, an unsupervised method adopting parallel domain adaptation is +proposed to shorten the channels between the teacher and student models to +preserve domain-invariant features. Experiments on four datasets across nine +languages demonstrate that the proposed method achieves new state-of-the-art +performance on zero-shot cross-lingual NER and shows great generalization and +compatibility across languages and fields.",cross-lingual named entity recognition, +Towards an efficient Iris Recognition System on Embedded Devices,"Iris Recognition (IR) is one of the market's most reliable and accurate +biometric systems. Today, it is challenging to build NIR-capturing devices +under the premise of hardware price reduction. Commercial NIR sensors are +protected from modification. 
The process of building a new device is not +trivial because it is required to start from scratch with the process of +capturing images with quality, calibrating operational distances, and building +lightweight software such as eyes/iris detectors and segmentation sub-systems. +In light of such challenges, this work aims to develop and implement iris +recognition software in an embedding system and calibrate NIR in a contactless +binocular setup. We evaluate and contrast speed versus performance obtained +with two embedded computers and infrared cameras. Further, a lightweight +segmenter sub-system called ""Unet_xxs"" is proposed, which can be used for iris +semantic segmentation under restricted memory resources.",iris recognition, +Temporal Link Prediction via Adjusted Sigmoid Function and 2-Simplex Sructure,"Temporal network link prediction is an important task in the field of network +science, and has a wide range of applications in practical scenarios. Revealing +the evolutionary mechanism of the network is essential for link prediction, and +how to effectively utilize the historical information for temporal links and +efficiently extract the high-order patterns of network structure remains a +vital challenge. To address these issues, in this paper, we propose a novel +temporal link prediction model with adjusted sigmoid function and 2-simplex +structure (TLPSS). The adjusted sigmoid decay mode takes the active, decay and +stable states of edges into account, which properly fits the life cycle of +information. Moreover, the latent matrix sequence is introduced, which is +composed of simplex high-order structure, to enhance the performance of link +prediction method since it is highly feasible in sparse network. Combining the +life cycle of information and simplex high-order structure, the overall +performance of TLPSS is achieved by satisfying the consistency of temporal and +structural information in dynamic networks. 
Experimental results on six +real-world datasets demonstrate the effectiveness of TLPSS, and our proposed +model improves the performance of link prediction by an average of 15% compared +to other baseline methods.",temporal link prediction, +An Overview & Analysis of Sequence-to-Sequence Emotional Voice Conversion,"Emotional voice conversion (EVC) focuses on converting a speech utterance +from a source to a target emotion; it can thus be a key enabling technology for +human-computer interaction applications and beyond. However, EVC remains an +unsolved research problem with several challenges. In particular, as speech +rate and rhythm are two key factors of emotional conversion, models have to +generate output sequences of differing length. Sequence-to-sequence modelling +is recently emerging as a competitive paradigm for models that can overcome +those challenges. In an attempt to stimulate further research in this promising +new direction, recent sequence-to-sequence EVC papers were systematically +investigated and reviewed from six perspectives: their motivation, training +strategies, model architectures, datasets, model inputs, and evaluation +methods. This information is organised to provide the research community with +an easily digestible overview of the current state-of-the-art. Finally, we +discuss existing challenges of sequence-to-sequence EVC.",Emotional voice conversion, +Testing the limits of natural language models for predicting human language judgments,"Neural network language models can serve as computational hypotheses about +how humans process language. We compared the model-human consistency of diverse +language models using a novel experimental approach: controversial sentence +pairs. For each controversial sentence pair, two language models disagree about +which sentence is more likely to occur in natural text. 
Considering nine +language models (including n-gram, recurrent neural networks, and transformer +models), we created hundreds of such controversial sentence pairs by either +selecting sentences from a corpus or synthetically optimizing sentence pairs to +be highly controversial. Human subjects then provided judgments indicating for +each pair which of the two sentences is more likely. Controversial sentence +pairs proved highly effective at revealing model failures and identifying +models that aligned most closely with human judgments. The most +human-consistent model tested was GPT-2, although experiments also revealed +significant shortcomings of its alignment with human perception.",predicting human judgments, +Consistency and Accuracy of CelebA Attribute Values,"We report the first systematic analysis of the experimental foundations of +facial attribute classification. Two annotators independently assigning +attribute values shows that only 12 of 40 common attributes are assigned values +with >= 95% consistency, and three (high cheekbones, pointed nose, oval face) +have essentially random consistency. Of 5,068 duplicate face appearances in +CelebA, attributes have contradicting values on from 10 to 860 of the 5,068 +duplicates. Manual audit of a subset of CelebA estimates error rates as high as +40% for (no beard=false), even though the labeling consistency experiment +indicates that no beard could be assigned with >= 95% consistency. Selecting +the mouth slightly open (MSO) for deeper analysis, we estimate the error rate +for (MSO=true) at about 20% and (MSO=false) at about 2%. A corrected version of +the MSO attribute values enables learning a model that achieves higher accuracy +than previously reported for MSO. 
Corrected values for CelebA MSO are available +at https://github.com/HaiyuWu/CelebAMSO.",facial attribute classification, +Measurement-Consistent Networks via a Deep Implicit Layer for Solving Inverse Problems,"End-to-end deep neural networks (DNNs) have become the state-of-the-art +(SOTA) for solving inverse problems. Despite their outstanding performance, +during deployment, such networks are sensitive to minor variations in the +testing pipeline and often fail to reconstruct small but important details, a +feature critical in medical imaging, astronomy, or defence. Such instabilities +in DNNs can be explained by the fact that they ignore the forward measurement +model during deployment, and thus fail to enforce consistency between their +output and the input measurements. To overcome this, we propose a framework +that transforms any DNN for inverse problems into a measurement-consistent one. +This is done by appending to it an implicit layer (or deep equilibrium network) +designed to solve a model-based optimization problem. The implicit layer +consists of a shallow learnable network that can be integrated into the +end-to-end training while keeping the SOTA DNN fixed. Experiments on +single-image super-resolution show that the proposed framework leads to +significant improvements in reconstruction quality and robustness over the SOTA +DNNs.",Inverse Problems Solving, +Fingerprint Image-Quality Estimation and its Application to Multialgorithm Verification,"Signal-quality awareness has been found to increase recognition rates and to +support decisions in multisensor environments significantly. Nevertheless, +automatic quality assessment is still an open issue. Here, we study the +orientation tensor of fingerprint images to quantify signal impairments, such +as noise, lack of structure, blur, with the help of symmetry descriptors. A +strongly reduced reference is especially favorable in biometrics, but less +information is not sufficient for the approach. 
This is also supported by +numerous experiments involving a simpler quality estimator, a trained method +(NFIQ), as well as the human perception of fingerprint quality on several +public databases. Furthermore, quality measurements are extensively reused to +adapt fusion parameters in a monomodal multialgorithm fingerprint recognition +environment. In this study, several trained and nontrained score-level fusion +schemes are investigated. A Bayes-based strategy for incorporating experts past +performances and current quality conditions, a novel cascaded scheme for +computational efficiency, besides simple fusion rules, is presented. The +quantitative results favor quality awareness under all aspects, boosting +recognition rates and fusing differently skilled experts efficiently as well as +effectively (by training).",Signal Quality Assessment, +$\textit{latent}$-GLAT: Glancing at Latent Variables for Parallel Text Generation,"Recently, parallel text generation has received widespread attention due to +its success in generation efficiency. Although many advanced techniques are +proposed to improve its generation quality, they still need the help of an +autoregressive model for training to overcome the one-to-many multi-modal +phenomenon in the dataset, limiting their applications. In this paper, we +propose $\textit{latent}$-GLAT, which employs the discrete latent variables to +capture word categorical information and invoke an advanced curriculum learning +technique, alleviating the multi-modality problem. Experiment results show that +our method outperforms strong baselines without the help of an autoregressive +model, which further broadens the application scenarios of the parallel +decoding paradigm.",parallel text generation, +Cooperative Artificial Intelligence,"In the future, artificial learning agents are likely to become increasingly +widespread in our society. 
They will interact with both other learning agents +and humans in a variety of complex settings including social dilemmas. We argue +that there is a need for research on the intersection between game theory and +artificial intelligence, with the goal of achieving cooperative artificial +intelligence that can navigate social dilemmas well. We consider the problem of +how an external agent can promote cooperation between artificial learners by +distributing additional rewards and punishments based on observing the actions +of the learners. We propose a rule for automatically learning how to create the +right incentives by considering the anticipated parameter updates of each +agent. Using this learning rule leads to cooperation with high social welfare +in matrix games in which the agents would otherwise learn to defect with high +probability. We show that the resulting cooperative outcome is stable in +certain games even if the planning agent is turned off after a given number of +episodes, while other games require ongoing intervention to maintain mutual +cooperation. Finally, we reflect on what the goals of multi-agent reinforcement +learning should be in the first place, and discuss the necessary building +blocks towards the goal of building cooperative AI.",Cooperative agents, +Reward Uncertainty for Exploration in Preference-based Reinforcement Learning,"Conveying complex objectives to reinforcement learning (RL) agents often +requires meticulous reward engineering. Preference-based RL methods are able to +learn a more flexible reward model based on human preferences by actively +incorporating human feedback, i.e. teacher's preferences between two clips of +behaviors. However, poor feedback-efficiency still remains a problem in current +preference-based RL algorithms, as tailored human feedback is very expensive. +To handle this issue, previous methods have mainly focused on improving query +selection and policy initialization. 
At the same time, recent exploration +methods have proven to be a recipe for improving sample-efficiency in RL. We +present an exploration method specifically for preference-based RL algorithms. +Our main idea is to design an intrinsic reward by measuring the novelty based +on learned reward. Specifically, we utilize disagreement across ensemble of +learned reward models. Our intuition is that disagreement in learned reward +model reflects uncertainty in tailored human feedback and could be useful for +exploration. Our experiments show that exploration bonus from uncertainty in +learned reward improves both feedback- and sample-efficiency of +preference-based RL algorithms on complex robot manipulation tasks from +MetaWorld benchmarks, compared with other existing exploration methods that +measure the novelty of state visitation.",preference-based reinforcement learning, +PRBoost: Prompt-Based Rule Discovery and Boosting for Interactive Weakly-Supervised Learning,"Weakly-supervised learning (WSL) has shown promising results in addressing +label scarcity on many NLP tasks, but manually designing a comprehensive, +high-quality labeling rule set is tedious and difficult. We study interactive +weakly-supervised learning -- the problem of iteratively and automatically +discovering novel labeling rules from data to improve the WSL model. Our +proposed model, named PRBoost, achieves this goal via iterative prompt-based +rule discovery and model boosting. It uses boosting to identify large-error +instances and then discovers candidate rules from them by prompting pre-trained +LMs with rule templates. The candidate rules are judged by human experts, and +the accepted rules are used to generate complementary weak labels and +strengthen the current model. Experiments on four tasks show PRBoost +outperforms state-of-the-art WSL baselines up to 7.1% and bridges the gaps with +fully supervised models. 
Our Implementation is available at +\url{https://github.com/rz-zhang/PRBoost}.",weakly-supervised learning, +Real-Time Multi-Modal Semantic Fusion on Unmanned Aerial Vehicles with Label Propagation for Cross-Domain Adaptation,"Unmanned aerial vehicles (UAVs) equipped with multiple complementary sensors +have tremendous potential for fast autonomous or remote-controlled semantic +scene analysis, e.g., for disaster examination. Here, we propose a UAV system +for real-time semantic inference and fusion of multiple sensor modalities. +Semantic segmentation of LiDAR scans and RGB images, as well as object +detection on RGB and thermal images, run online onboard the UAV computer using +lightweight CNN architectures and embedded inference accelerators. We follow a +late fusion approach where semantic information from multiple sensor modalities +augments 3D point clouds and image segmentation masks while also generating an +allocentric semantic map. Label propagation on the semantic map allows for +sensor-specific adaptation with cross-modality and cross-domain supervision. +Our system provides augmented semantic images and point clouds with $\approx$ 9 +Hz. We evaluate the integrated system in real-world experiments in an urban +environment and at a disaster test site.",Cross-Domain Adaptation, +"Provably Convergent Plug & Play Linearized ADMM, applied to Deblurring Spatially Varying Kernels","Plug & Play methods combine proximal algorithms with denoiser priors to solve +inverse problems. These methods rely on the computability of the proximal +operator of the data fidelity term. In this paper, we propose a Plug & Play +framework based on linearized ADMM that allows us to bypass the computation of +intractable proximal operators. 
We demonstrate the convergence of the algorithm +and provide results on restoration tasks such as super-resolution and +deblurring with non-uniform blur.",inverse problem solving, +Wide Area Network Intelligence with Application to Multimedia Service,"Network intelligence is a discipline that builds on the capabilities of +network systems to act intelligently by the usage of network resources for +delivering high-quality services in a changing environment. Wide area network +intelligence is a class of network intelligence in wide area network which +covers the core and the edge of Internet. In this paper, we propose a system +based on machine learning for wide area network intelligence. The whole system +consists of a core machine for pre-training and many terminal machines to +accomplish faster responses. Each machine is one of dual-hemisphere models +which are made of left and right hemispheres. The left hemisphere is used to +improve latency by terminal response and the right hemisphere is used to +improve communication by data generation. In an application on multimedia +service, the proposed model is superior to the latest deep feed forward neural +network in the data center with respect to the accuracy, latency and +communication. Evaluation shows scalable improvement with regard to the number +of terminal machines. Evaluation also shows the cost of improvement is longer +learning time.",network intelligence, +Trust Your $\nabla$: Gradient-based Intervention Targeting for Causal Discovery,"Inferring causal structure from data is a challenging task of fundamental +importance in science. Observational data are often insufficient to identify a +system's causal structure uniquely. While conducting interventions (i.e., +experiments) can improve the identifiability, such samples are usually +challenging and expensive to obtain. 
Hence, experimental design approaches for +causal discovery aim to minimize the number of interventions by estimating the +most informative intervention target. In this work, we propose a novel +Gradient-based Intervention Targeting method, abbreviated GIT, that 'trusts' +the gradient estimator of a gradient-based causal discovery framework to +provide signals for the intervention acquisition function. We provide extensive +experiments in simulated and real-world datasets and demonstrate that GIT +performs on par with competitive baselines, surpassing them in the low-data +regime.",Causal Discovery, +BlanketGen - A synthetic blanket occlusion augmentation pipeline for MoCap datasets,"Human motion analysis has seen drastic improvements recently, however, due to +the lack of representative datasets, for clinical in-bed scenarios it is still +lagging behind. To address this issue, we implemented BlanketGen, a pipeline +that augments videos with synthetic blanket occlusions. With this pipeline, we +generated an augmented version of the pose estimation dataset 3DPW called +BlanketGen-3DPW. We then used this new dataset to fine-tune a Deep Learning +model to improve its performance in these scenarios with promising results. +Code and further information are available at +https://gitlab.inesctec.pt/brain-lab/brain-lab-public/blanket-gen-releases.",In-Bed Human Motion Analysis, +Training Vision-Language Transformers from Captions,"Vision-Language Transformers can be learned without low-level human labels +(e.g. class labels, bounding boxes, etc). Existing work, whether explicitly +utilizing bounding boxes or patches, assumes that the visual backbone must +first be trained on ImageNet class prediction before being integrated into a +multimodal linguistic pipeline. We show that this is not necessary and +introduce a new model Vision-Language from Captions (VLC) built on top of +Masked Auto-Encoders that does not require this supervision. 
In fact, in a +head-to-head comparison between ViLT, the current state-of-the-art patch-based +vision-language transformer which is pretrained with supervised object +classification, and our model, VLC, we find that our approach 1. outperforms +ViLT on standard benchmarks, 2. provides more interpretable and intuitive patch +visualizations, and 3. is competitive with many larger models that utilize ROIs +trained on annotated bounding-boxes.",vision-language transformers, +Predictive linguistic cues for fake news: a societal artificial intelligence problem,"Media news are making a large part of public opinion and, therefore, must not +be fake. News on web sites, blogs, and social media must be analyzed before +being published. In this paper, we present linguistic characteristics of media +news items to differentiate between fake news and real news using machine +learning algorithms. Neural fake news generation, headlines created by +machines, semantic incongruities in text and image captions generated by +machine are other types of fake news problems. These problems use neural +networks which mainly control distributional features rather than evidence. We +propose applying correlation between features set and class, and correlation +among the features to compute correlation attribute evaluation metric and +covariance metric to compute variance of attributes over the news items. +Features unique, negative, positive, and cardinal numbers with high values on +the metrics are observed to provide a high area under the curve (AUC) and +F1-score.",fake news detection, +Differentiable Frequency-based Disentanglement for Aerial Video Action Recognition,"We present a learning algorithm for human activity recognition in videos. Our +approach is designed for UAV videos, which are mainly acquired from obliquely +placed dynamic cameras that contain a human actor along with background motion. +Typically, the human actors occupy less than one-tenth of the spatial +resolution. 
Our approach simultaneously harnesses the benefits of frequency +domain representations, a classical analysis tool in signal processing, and +data driven neural networks. We build a differentiable static-dynamic frequency +mask prior to model the salient static and dynamic pixels in the video, crucial +for the underlying task of action recognition. We use this differentiable mask +prior to enable the neural network to intrinsically learn disentangled feature +representations via an identity loss function. Our formulation empowers the +network to inherently compute disentangled salient features within its layers. +Further, we propose a cost-function encapsulating temporal relevance and +spatial content to sample the most important frame within uniformly spaced +video segments. We conduct extensive experiments on the UAV Human dataset and +the NEC Drone dataset and demonstrate relative improvements of 5.72% - 13.00% +over the state-of-the-art and 14.28% - 38.05% over the corresponding baseline +model.",human activity recognition, +Weakly Supervised Grounding for VQA in Vision-Language Transformers,"Transformers for visual-language representation learning have been getting a +lot of interest and shown tremendous performance on visual question answering +(VQA) and grounding. But most systems that show good performance of those tasks +still rely on pre-trained object detectors during training, which limits their +applicability to the object classes available for those detectors. To mitigate +this limitation, the following paper focuses on the problem of weakly +supervised grounding in context of visual question answering in transformers. +The approach leverages capsules by grouping each visual token in the visual +encoder and uses activations from language self-attention layers as a +text-guided selection module to mask those capsules before they are forwarded +to the next layer. 
We evaluate our approach on the challenging GQA as well as +VQA-HAT dataset for VQA grounding. Our experiments show that: while removing +the information of masked objects from standard transformer architectures leads +to a significant drop in performance, the integration of capsules significantly +improves the grounding ability of such systems and provides new +state-of-the-art results compared to other approaches in the field.",visual question answering, +BlanketSet -- A clinical real-world in-bed action recognition and qualitative semi-synchronised MoCap dataset,"Clinical in-bed video-based human motion analysis is a very relevant computer +vision topic for several relevant biomedical applications. Nevertheless, the +main public large datasets (e.g. ImageNet or 3DPW) used for deep learning +approaches lack annotated examples for these clinical scenarios. To address +this issue, we introduce BlanketSet, an RGB-IR-D action recognition dataset of +sequences performed in a hospital bed. This dataset has the potential to help +bridge the improvements attained in more general large datasets to these +clinical scenarios. Information on how to access the dataset is available at +https://rdm.inesctec.pt/dataset/nis-2022-004.",human motion analysis, +TIARA: Multi-grained Retrieval for Robust Question Answering over Large Knowledge Bases,"Pre-trained language models (PLMs) have shown their effectiveness in multiple +scenarios. However, KBQA remains challenging, especially regarding coverage and +generalization settings. This is due to two main factors: i) understanding the +semantics of both questions and relevant knowledge from the KB; ii) generating +executable logical forms with both semantic and syntactic correctness. In this +paper, we present a new KBQA model, TIARA, which addresses those issues by +applying multi-grained retrieval to help the PLM focus on the most relevant KB +contexts, viz., entities, exemplary logical forms, and schema items. 
Moreover, +constrained decoding is used to control the output space and reduce generation +errors. Experiments over important benchmarks demonstrate the effectiveness of +our approach. TIARA outperforms previous SOTA, including those using PLMs or +oracle entity annotations, by at least 4.1 and 1.1 F1 points on GrailQA and +WebQuestionsSP, respectively.",question answering, +A Framework to Generate High-Quality Datapoints for Multiple Novel Intent Detection,"Systems like Voice-command based conversational agents are characterized by a +pre-defined set of skills or intents to perform user specified tasks. In the +course of time, newer intents may emerge requiring retraining. However, the +newer intents may not be explicitly announced and need to be inferred +dynamically. Thus, there are two important tasks at hand (a). identifying +emerging new intents, (b). annotating data of the new intents so that the +underlying classifier can be retrained efficiently. The tasks become specially +challenging when a large number of new intents emerge simultaneously and there +is a limited budget of manual annotation. In this paper, we propose MNID +(Multiple Novel Intent Detection) which is a cluster based framework to detect +multiple novel intents with budgeted human annotation cost. Empirical results +on various benchmark datasets (of different sizes) demonstrate that MNID, by +intelligently using the budget for annotation, outperforms the baseline methods +in terms of accuracy and F1-score.",intent detection, +"Strong Admissibility, a Tractable Algorithmic Approach (proofs)","Much like admissibility is the key concept underlying preferred semantics, +strong admissibility is the key concept underlying grounded semantics, as +membership of a strongly admissible set is sufficient to show membership of the +grounded extension. 
As such, strongly admissible sets and labellings can be +used as an explanation of membership of the grounded extension, as is for +instance done in some of the proof procedures for grounded semantics. In the +current paper, we present two polynomial algorithms for constructing relatively +small strongly admissible labellings, with associated min-max numberings, for a +particular argument. These labellings can be used as relatively small +explanations for the argument's membership of the grounded extension. Although +our algorithms are not guaranteed to yield an absolute minimal strongly +admissible labelling for the argument (as doing do would have implied an +exponential complexity), our best performing algorithm yields results that are +only marginally bigger. Moreover, the runtime of this algorithm is an order of +magnitude smaller than that of the existing approach for computing an absolute +minimal strongly admissible labelling for a particular argument. As such, we +believe that our algorithms can be of practical value in situations where the +aim is to construct a minimal or near-minimal strongly admissible labelling in +a time-efficient way.",strong admissibility algorithms, +Learning Modular Structures That Generalize Out-of-Distribution,"Out-of-distribution (O.O.D.) generalization remains to be a key challenge for +real-world machine learning systems. We describe a method for O.O.D. +generalization that, through training, encourages models to only preserve +features in the network that are well reused across multiple training domains. +Our method combines two complementary neuron-level regularizers with a +probabilistic differentiable binary mask over the network, to extract a modular +sub-network that achieves better O.O.D. performance than the original network. 
+Preliminary evaluation on two benchmark datasets corroborates the promise of +our method.",out-of-distribution generalization, +"Deep Reinforcement Learning, a textbook","Deep reinforcement learning has gathered much attention recently. Impressive +results were achieved in activities as diverse as autonomous driving, game +playing, molecular recombination, and robotics. In all these fields, computer +programs have taught themselves to solve difficult problems. They have learned +to fly model helicopters and perform aerobatic manoeuvers such as loops and +rolls. In some applications they have even become better than the best humans, +such as in Atari, Go, poker and StarCraft. The way in which deep reinforcement +learning explores complex environments reminds us of how children learn, by +playfully trying out things, getting feedback, and trying again. The computer +seems to truly possess aspects of human learning; this goes to the heart of the +dream of artificial intelligence. The successes in research have not gone +unnoticed by educators, and universities have started to offer courses on the +subject. The aim of this book is to provide a comprehensive overview of the +field of deep reinforcement learning. The book is written for graduate students +of artificial intelligence, and for researchers and practitioners who wish to +better understand deep reinforcement learning methods and their challenges. We +assume an undergraduate-level of understanding of computer science and +artificial intelligence; the programming language of this book is Python. We +describe the foundations, the algorithms and the applications of deep +reinforcement learning. We cover the established model-free and model-based +methods that form the basis of the field. 
Developments go quickly, and we also +cover advanced topics: deep multi-agent reinforcement learning, deep +hierarchical reinforcement learning, and deep meta learning.",deep reinforcement learning, +Language Tokens: A Frustratingly Simple Approach Improves Zero-Shot Performance of Multilingual Translation,"This paper proposes a simple yet effective method to improve direct (X-to-Y) +translation for both cases: zero-shot and when direct data is available. We +modify the input tokens at both the encoder and decoder to include signals for +the source and target languages. We show a performance gain when training from +scratch, or finetuning a pretrained model with the proposed setup. In the +experiments, our method shows nearly 10.0 BLEU points gain on in-house datasets +depending on the checkpoint selection criteria. In a WMT evaluation campaign, +From-English performance improves by 4.17 and 2.87 BLEU points, in the +zero-shot setting, and when direct data is available for training, +respectively. While X-to-Y improves by 1.29 BLEU over the zero-shot baseline, +and 0.44 over the many-to-many baseline. In the low-resource setting, we see a +1.5~1.7 point improvement when finetuning on X-to-Y domain data.",multilingual translation, +UrduFake@FIRE2020: Shared Track on Fake News Identification in Urdu,"This paper gives the overview of the first shared task at FIRE 2020 on fake +news detection in the Urdu language. This is a binary classification task in +which the goal is to identify fake news using a dataset composed of 900 +annotated news articles for training and 400 news articles for testing. The +dataset contains news in five domains: (i) Health, (ii) Sports, (iii) Showbiz, +(iv) Technology, and (v) Business. 42 teams from 6 different countries (India, +China, Egypt, Germany, Pakistan, and the UK) registered for the task. 9 teams +submitted their experimental results. 
The participants used various machine +learning methods ranging from feature-based traditional machine learning to +neural network techniques. The best performing system achieved an F-score value +of 0.90, showing that the BERT-based approach outperforms other machine +learning classifiers.",fake news detection, +Dynamic Batch Adaptation,"Current deep learning adaptive optimizer methods adjust the step magnitude of +parameter updates by altering the effective learning rate used by each +parameter. Motivated by the known inverse relation between batch size and +learning rate on update step magnitudes, we introduce a novel training +procedure that dynamically decides the dimension and the composition of the +current update step. Our procedure, Dynamic Batch Adaptation (DBA) analyzes the +gradients of every sample and selects the subset that best improves certain +metrics such as gradient variance for each layer of the network. We present +results showing DBA significantly improves the speed of model convergence. +Additionally, we find that DBA produces an increased improvement over standard +optimizers when used in data scarce conditions where, in addition to +convergence speed, it also significantly improves model generalization, +managing to train a network with a single fully connected hidden layer using +only 1% of the MNIST dataset to reach 97.79% test accuracy. In an even more +extreme scenario, it manages to reach 97.44% test accuracy using only 10 +samples per class. These results represent a relative error rate reduction of +81.78% and 88.07% respectively, compared to the standard optimizers, Stochastic +Gradient Descent (SGD) and Adam.",deep learning optimizers, +Scheduling with Predictions,"There is significant interest in deploying machine learning algorithms for +diagnostic radiology, as modern learning techniques have made it possible to +detect abnormalities in medical images within minutes. 
While machine-assisted +diagnoses cannot yet reliably replace human reviews of images by a radiologist, +they could inform prioritization rules for determining the order by which to +review patient cases so that patients with time-sensitive conditions could +benefit from early intervention. + We study this scenario by formulating it as a learning-augmented online +scheduling problem. We are given information about each arriving patient's +urgency level in advance, but these predictions are inevitably error-prone. In +this formulation, we face the challenges of decision making under imperfect +information, and of responding dynamically to prediction error as we observe +better data in real-time. We propose a simple online policy and show that this +policy is in fact the best possible in certain stylized settings. We also +demonstrate that our policy achieves the two desiderata of online algorithms +with predictions: consistency (performance improvement with prediction +accuracy) and robustness (protection against the worst case). We complement our +theoretical findings with empirical evaluations of the policy under settings +that more accurately reflect clinical scenarios in the real world.",machine-assisted diagnostic radiology, +Language models show human-like content effects on reasoning tasks,"Abstract reasoning is a key ability for an intelligent system. Large language +models (LMs) achieve above-chance performance on abstract reasoning tasks, but +exhibit many imperfections. However, human abstract reasoning is also +imperfect. For example, human reasoning is affected by our real-world knowledge +and beliefs, and shows notable ""content effects""; humans reason more reliably +when the semantic content of a problem supports the correct logical inferences. +These content-entangled reasoning patterns play a central role in debates about +the fundamental nature of human intelligence. 
Here, we investigate whether +language models $\unicode{x2014}$ whose prior expectations capture some aspects +of human knowledge $\unicode{x2014}$ similarly mix content into their answers +to logical problems. We explored this question across three logical reasoning +tasks: natural language inference, judging the logical validity of syllogisms, +and the Wason selection task. We evaluate state of the art large language +models, as well as humans, and find that the language models reflect many of +the same patterns observed in humans across these tasks $\unicode{x2014}$ like +humans, models answer more accurately when the semantic content of a task +supports the logical inferences. These parallels are reflected both in answer +patterns, and in lower-level features like the relationship between model +answer distributions and human response times. Our findings have implications +for understanding both these cognitive effects in humans, and the factors that +contribute to language model performance.",LLM reasoning, +ECOLA: Enhanced Temporal Knowledge Embeddings with Contextualized Language Representations,"Since conventional knowledge embedding models cannot take full advantage of +the abundant textual information, there have been extensive research efforts in +enhancing knowledge embedding using texts. However, existing enhancement +approaches cannot apply to temporal knowledge graphs (tKGs), which contain +time-dependent event knowledge with complex temporal dynamics. Specifically, +existing enhancement approaches often assume knowledge embedding is +time-independent. In contrast, the entity embedding in tKG models usually +evolves, which poses the challenge of aligning temporally relevant texts with +entities. To this end, we propose to study enhancing temporal knowledge +embedding with textual data in this paper. 
As an approach to this task, we +propose Enhanced Temporal Knowledge Embeddings with Contextualized Language +Representations (ECOLA), which takes the temporal aspect into account and +injects textual information into temporal knowledge embedding. To evaluate +ECOLA, we introduce three new datasets for training and evaluating ECOLA. +Extensive experiments show that ECOLA significantly enhances temporal KG +embedding models with up to 287% relative improvements regarding Hits@1 on the +link prediction task. The code and models are publicly available on +https://anonymous.4open.science/r/ECOLA.",knowledge embedding, +$ \text{T}^3 $OMVP: A Transformer-based Time and Team Reinforcement Learning Scheme for Observation-constrained Multi-Vehicle Pursuit in Urban Area,"Smart Internet of Vehicles (IoVs) combined with Artificial Intelligence (AI) +will contribute to vehicle decision-making in the Intelligent Transportation +System (ITS). Multi-Vehicle Pursuit games (MVP), a multi-vehicle cooperative +ability to capture mobile targets, is becoming a hot research topic gradually. +Although there are some achievements in the field of MVP in the open space +environment, the urban area brings complicated road structures and restricted +moving spaces as challenges to the resolution of MVP games. We define an +Observation-constrained MVP (OMVP) problem in this paper and propose a +Transformer-based Time and Team Reinforcement Learning scheme ($ \text{T}^3 +$OMVP) to address the problem. First, a new multi-vehicle pursuit model is +constructed based on decentralized partially observed Markov decision processes +(Dec-POMDP) to instantiate this problem. Second, by introducing and modifying +the transformer-based observation sequence, QMIX is redefined to adapt to the +complicated road structure, restricted moving spaces and constrained +observations, so as to control vehicles to pursue the target combining the +vehicle's observations. 
Third, a multi-intersection urban environment is built +to verify the proposed scheme. Extensive experimental results demonstrate that +the proposed $ \text{T}^3 $OMVP scheme achieves significant improvements +relative to state-of-the-art QMIX approaches by 9.66%~106.25%. Code is +available at https://github.com/pipihaiziguai/T3OMVP.",Multi-Vehicle Pursuit, +BInGo: Bayesian Intrinsic Groupwise Registration via Explicit Hierarchical Disentanglement,"Multimodal groupwise registration aligns internal structures in a group of +medical images. Current approaches to this problem involve developing +similarity measures over the joint intensity profile of all images, which may +be computationally prohibitive for large image groups and unstable under +various conditions. To tackle these issues, we propose BInGo, a general +unsupervised hierarchical Bayesian framework based on deep learning, to learn +intrinsic structural representations to measure the similarity of multimodal +images. Particularly, a variational auto-encoder with a novel posterior is +proposed, which facilitates the disentanglement learning of structural +representations and spatial transformations, and characterizes the imaging +process from the common structure with shape transition and appearance +variation. Notably, BInGo is scalable to learn from small groups, whereas being +tested for large-scale groupwise registration, thus significantly reducing +computational costs. We compared BInGo with five iterative or deep learning +methods on three public intrasubject and intersubject datasets, i.e. 
BraTS, +MS-CMR of the heart, and Learn2Reg abdomen MR-CT, and demonstrated its superior +accuracy and computational efficiency, even for very large group sizes (e.g., +over 1300 2D images from MS-CMR in each group).",Multimodal groupwise registration , +IDE-3D: Interactive Disentangled Editing for High-Resolution 3D-aware Portrait Synthesis,"Existing 3D-aware facial generation methods face a dilemma in quality versus +editability: they either generate editable results in low resolution or +high-quality ones with no editing flexibility. In this work, we propose a new +approach that brings the best of both worlds together. Our system consists of +three major components: (1) a 3D-semantics-aware generative model that produces +view-consistent, disentangled face images and semantic masks; (2) a hybrid GAN +inversion approach that initialize the latent codes from the semantic and +texture encoder, and further optimized them for faithful reconstruction; and +(3) a canonical editor that enables efficient manipulation of semantic masks in +canonical view and product high-quality editing results. Our approach is +competent for many applications, e.g. free-view face drawing, editing, and +style control. Both quantitative and qualitative results show that our method +reaches the state-of-the-art in terms of photorealism, faithfulness, and +efficiency.",3D-aware portrait synthesis, +MiniDisc: Minimal Distillation Schedule for Language Model Compression,"Recent studies have uncovered that language model distillation is less +effective when facing a large capacity gap between the teacher and the student, +and introduced teacher assistant-based distillation to bridge the gap. As a +connection, the scale and the performance of the teacher assistant is of vital +importance to bring the knowledge from the teacher to the student. However, +existing teacher assistant-based methods require maximally many trials before +scheduling an optimal teacher assistant. 
To this end, we propose a minimal +distillation schedule (MiniDisc) for scheduling the optimal teacher assistant +in minimally one trial. In particular, motivated by the finding that the +performance of the student is positively correlated to the scale-performance +tradeoff of the teacher assistant, MiniDisc is designed with a +$\lambda$-tradeoff to measure the optimality of the teacher assistant without +trial distillation to the student. MiniDisc then can schedule the optimal +teacher assistant with the best $\lambda$-tradeoff in a sandwich framework. +MiniDisc is evaluated with an extensive set of experiments on GLUE. +Experimental results demonstrate the improved efficiency our MiniDisc compared +to several state-of-the-art baselines. We further apply MiniDisc to a language +model with billions of parameters and show its scalability.",language model distillation, +Language and Culture Internalisation for Human-Like Autotelic AI,"Building autonomous agents able to grow open-ended repertoires of skills +across their lives is a fundamental goal of artificial intelligence (AI). A +promising developmental approach recommends the design of intrinsically +motivated agents that learn new skills by generating and pursuing their own +goals - autotelic agents. But despite recent progress, existing algorithms +still show serious limitations in terms of goal diversity, exploration, +generalisation or skill composition. This perspective calls for the immersion +of autotelic agents into rich socio-cultural worlds, an immensely important +attribute of our environment that shapes human cognition but is mostly omitted +in modern AI. Inspired by the seminal work of Vygotsky, we propose Vygotskian +autotelic agents - agents able to internalise their interactions with others +and turn them into cognitive tools. We focus on language and show how its +structure and informational content may support the development of new +cognitive functions in artificial agents as it does in humans. 
We justify the +approach by uncovering several examples of new artificial cognitive functions +emerging from interactions between language and embodiment in recent works at +the intersection of deep reinforcement learning and natural language +processing. Looking forward, we highlight future opportunities and challenges +for Vygotskian Autotelic AI research, including the use of language models as +cultural models supporting artificial cognitive development.",autonomous agents, +Learning with Signatures,"In this work we investigate the use of the Signature Transform in the context +of Learning. Under this assumption, we advance a supervised framework that +potentially provides state-of-the-art classification accuracy with the use of +few labels without the need of credit assignment and with minimal or no +overfitting. We leverage tools from harmonic analysis by the use of the +signature and log-signature, and use as a score function RMSE and MAE Signature +and log-signature. We develop a closed-form equation to compute probably good +optimal scale factors, as well as the formulation to obtain them by +optimization. Techniques of Signal Processing are addressed to further +characterize the problem. Classification is performed at the CPU level orders +of magnitude faster than other methods. We report results on AFHQ, MNIST and +CIFAR10, achieving 100% accuracy on all tasks assuming we can determine at test +time which probably good optimal scale factor to use for each category.",signature transform, +Where and What: Driver Attention-based Object Detection,"Human drivers use their attentional mechanisms to focus on critical objects +and make decisions while driving. As human attention can be revealed from gaze +data, capturing and analyzing gaze information has emerged in recent years to +benefit autonomous driving technology. 
Previous works in this context have +primarily aimed at predicting ""where"" human drivers look at and lack knowledge +of ""what"" objects drivers focus on. Our work bridges the gap between +pixel-level and object-level attention prediction. Specifically, we propose to +integrate an attention prediction module into a pretrained object detection +framework and predict the attention in a grid-based style. Furthermore, +critical objects are recognized based on predicted attended-to areas. We +evaluate our proposed method on two driver attention datasets, BDD-A and +DR(eye)VE. Our framework achieves competitive state-of-the-art performance in +the attention prediction on both pixel-level and object-level but is far more +efficient (75.3 GFLOPs less) in computation.",object detection, +Correcting diacritics and typos with a ByT5 transformer model,"Due to the fast pace of life and online communications and the prevalence of +English and the QWERTY keyboard, people tend to forgo using diacritics, make +typographical errors (typos) when typing in other languages. Restoring +diacritics and correcting spelling is important for proper language use and the +disambiguation of texts for both humans and downstream algorithms. However, +both of these problems are typically addressed separately: the state-of-the-art +diacritics restoration methods do not tolerate other typos, but classical +spellcheckers also cannot deal adequately with all the diacritics missing. In +this work, we tackle both problems at once by employing the newly-developed +universal ByT5 byte-level seq2seq transformer model that requires no +language-specific model structures. For a comparison, we perform diacritics +restoration on benchmark datasets of 12 languages, with the addition of +Lithuanian. The experimental investigation proves that our approach is able to +achieve results (> 98%) comparable to the previous state-of-the-art, despite +being trained less and on fewer data. 
Our approach is also able to restore +diacritics in words not seen during training with > 76% accuracy. Our +simultaneous diacritics restoration and typos correction approach reaches > 94% +alpha-word accuracy on the 13 languages. It has no direct competitors and +strongly outperforms classical spell-checking or dictionary-based approaches. +We also demonstrate all the accuracies to further improve with more training. +Taken together, this shows the great real-world application potential of our +suggested methods to more data, languages, and error classes.",typos correction, +Egocentric Video Task Translation,"Different video understanding tasks are typically treated in isolation, and +even with distinct types of curated data (e.g., classifying sports in one +dataset, tracking animals in another). However, in wearable cameras, the +immersive egocentric perspective of a person engaging with the world around +them presents an interconnected web of video understanding tasks -- hand-object +manipulations, navigation in the space, or human-human interactions -- that +unfold continuously, driven by the person's goals. We argue that this calls for +a much more unified approach. We propose EgoTask Translation (EgoT2), which +takes a collection of models optimized on separate tasks and learns to +translate their outputs for improved performance on any or all of them at once. +Unlike traditional transfer or multi-task learning, EgoT2's flipped design +entails separate task-specific backbones and a task translator shared across +all tasks, which captures synergies between even heterogeneous tasks and +mitigates task competition. 
Demonstrating our model on a wide array of video +tasks from Ego4D, we show its advantages over existing transfer paradigms and +achieve top-ranked results on four of the Ego4D 2022 benchmark challenges.",egocentric video understanding, +ROCK: Causal Inference Principles for Reasoning about Commonsense Causality,"Commonsense causality reasoning (CCR) aims at identifying plausible causes +and effects in natural language descriptions that are deemed reasonable by an +average person. Although being of great academic and practical interest, this +problem is still shadowed by the lack of a well-posed theoretical framework; +existing work usually relies on deep language models wholeheartedly, and is +potentially susceptible to confounding co-occurrences. Motivated by classical +causal principles, we articulate the central question of CCR and draw parallels +between human subjects in observational studies and natural languages to adopt +CCR to the potential-outcomes framework, which is the first such attempt for +commonsense tasks. We propose a novel framework, ROCK, to Reason O(A)bout +Commonsense K(C)ausality, which utilizes temporal signals as incidental +supervision, and balances confounding effects using temporal propensities that +are analogous to propensity scores. The ROCK implementation is modular and +zero-shot, and demonstrates good CCR capabilities.",commonsense causality reasoning, +Romantic-Computing,"In this paper we compare various text generation models' ability to write +poetry in the style of early English Romanticism. These models include: +Character-Level Recurrent Neural Networks with Long Short-Term Memory, Hugging +Face's GPT-2, OpenAI's GPT-3, and EleutherAI's GPT-NEO. Quality was measured +based syllable count and coherence with the automatic evaluation metric GRUEN. +Character-Level Recurrent Neural Networks performed far worse compared to +transformer models. And, as parameter-size increased, the quality of +transformer models' poems improved. 
These models are typically not compared in +a creative context, and we are happy to contribute.",poetry generation, +End-to-End Multi-View Structure-from-Motion with Hypercorrelation Volumes,"Image-based 3D reconstruction is one of the most important tasks in Computer +Vision with many solutions proposed over the last few decades. The objective is +to extract metric information i.e. the geometry of scene objects directly from +images. These can then be used in a wide range of applications such as film, +games, virtual reality, etc. Recently, deep learning techniques have been +proposed to tackle this problem. They rely on training on vast amounts of data +to learn to associate features between images through deep convolutional neural +networks and have been shown to outperform traditional procedural techniques. +In this paper, we improve on the state-of-the-art two-view +structure-from-motion(SfM) approach of [11] by incorporating 4D correlation +volume for more accurate feature matching and reconstruction. Furthermore, we +extend it to the general multi-view case and evaluate it on the complex +benchmark dataset DTU [4]. Quantitative evaluations and comparisons with +state-of-the-art multi-view 3D reconstruction methods demonstrate its +superiority in terms of the accuracy of reconstructions.",Image-based 3D reconstruction, +Automated Imbalanced Learning,"Automated Machine Learning has grown very successful in automating the +time-consuming, iterative tasks of machine learning model development. However, +current methods struggle when the data is imbalanced. Since many real-world +datasets are naturally imbalanced, and improper handling of this issue can lead +to quite useless models, this issue should be handled carefully. This paper +first introduces a new benchmark to study how different AutoML methods are +affected by label imbalance. Second, we propose strategies to better deal with +imbalance and integrate them into an existing AutoML framework. 
Finally, we +present a systematic study which evaluates the impact of these strategies and +find that their inclusion in AutoML systems significantly increases their +robustness against label imbalance.",Imbalanced Data Machine Learning, +Making Large Language Models Better Reasoners with Step-Aware Verifier,"Few-shot learning is a challenging task that requires language models to +generalize from limited examples. Large language models like GPT-3 and PaLM +have made impressive progress in this area, but they still face difficulties in +reasoning tasks such as GSM8K, a benchmark for arithmetic problems. To improve +their reasoning skills, previous work has proposed to guide the language model +with prompts that elicit a series of reasoning steps before giving the final +answer, achieving a significant improvement on GSM8K from 17.9% to 58.1% in +problem-solving rate. In this paper, we present DIVERSE (Diverse Verifier on +Reasoning Step), a novel approach that further enhances the reasoning +capability of language models. DIVERSE has three main components: first, it +generates diverse prompts to explore different reasoning paths for the same +question; second, it uses a verifier to filter out incorrect answers based on a +weighted voting scheme; and third, it verifies each reasoning step individually +instead of the whole chain. We evaluate DIVERSE on the latest language model +code-davinci-002 and show that it achieves new state-of-the-art results on six +of eight reasoning benchmarks (e.g., GSM8K 74.4% to 83.2%).",language models reasoning, +Building Inspection Toolkit: Unified Evaluation and Strong Baselines for Damage Recognition,"In recent years, several companies and researchers have started to tackle the +problem of damage recognition within the scope of automated inspection of built +structures. 
While companies are neither willing to publish associated data nor +models, researchers are facing the problem of data shortage on one hand and +inconsistent dataset splitting with the absence of consistent metrics on the +other hand. This leads to incomparable results. Therefore, we introduce the +building inspection toolkit -- bikit -- which acts as a simple to use data hub +containing relevant open-source datasets in the field of damage recognition. +The datasets are enriched with evaluation splits and predefined metrics, +suiting the specific task and their data distribution. For the sake of +compatibility and to motivate researchers in this domain, we also provide a +leaderboard and the possibility to share model weights with the community. As +starting point we provide strong baselines for multi-target classification +tasks utilizing extensive hyperparameter search using three transfer learning +approaches for state-of-the-art algorithms. The toolkit and the leaderboard are +available online.",building damage recognition, +PCQA-GRAPHPOINT: Efficients Deep-Based Graph Metric For Point Cloud Quality Assessment,"Following the advent of immersive technologies and the increasing interest in +representing interactive geometrical format, 3D Point Clouds (PC) have emerged +as a promising solution and effective means to display 3D visual information. +In addition to other challenges in immersive applications, objective and +subjective quality assessments of compressed 3D content remain open problems +and an area of research interest. Yet most of the efforts in the research area +ignore the local geometrical structures between points representation. In this +paper, we overcome this limitation by introducing a novel and efficient +objective metric for Point Clouds Quality Assessment, by learning local +intrinsic dependencies using Graph Neural Network (GNN). To evaluate the +performance of our method, two well-known datasets have been used. 
The results +demonstrate the effectiveness and reliability of our solution compared to +state-of-the-art metrics.",point cloud quality assessment, +LSNet: Extremely Light-Weight Siamese Network For Change Detection in Remote Sensing Image,"The Siamese network is becoming the mainstream in change detection of remote +sensing images (RSI). However, in recent years, the development of more +complicated structure, module and training processe has resulted in the +cumbersome model, which hampers their application in large-scale RSI +processing. To this end, this paper proposes an extremely lightweight Siamese +network (LSNet) for RSI change detection, which replaces standard convolution +with depthwise separable atrous convolution, and removes redundant dense +connections, retaining only valid feature flows while performing Siamese +feature fusion, greatly compressing parameters and computation amount. Compared +with the first-place model on the CCD dataset, the parameters and the +computation amount of LSNet is greatly reduced by 90.35\% and 91.34\% +respectively, with only a 1.5\% drops in accuracy.",lightweight siamese network, +Persuasion Strategies in Advertisements,"Modeling what makes an advertisement persuasive, i.e., eliciting the desired +response from consumer, is critical to the study of propaganda, social +psychology, and marketing. Despite its importance, computational modeling of +persuasion in computer vision is still in its infancy, primarily due to the +lack of benchmark datasets that can provide persuasion-strategy labels +associated with ads. Motivated by persuasion literature in social psychology +and marketing, we introduce an extensive vocabulary of persuasion strategies +and build the first ad image corpus annotated with persuasion strategies. 
We +then formulate the task of persuasion strategy prediction with multi-modal +learning, where we design a multi-task attention fusion model that can leverage +other ad-understanding tasks to predict persuasion strategies. Further, we +conduct a real-world case study on 1600 advertising campaigns of 30 Fortune-500 +companies where we use our model's predictions to analyze which strategies work +with different demographics (age and gender). The dataset also provides image +segmentation masks, which labels persuasion strategies in the corresponding ad +images on the test split. We publicly release our code and dataset +https://midas-research.github.io/persuasion-advertisements/.",persuasion strategy prediction, +Effective Shortcut Technique for GAN,"In recent years, generative adversarial network (GAN)-based image generation +techniques design their generators by stacking up multiple residual blocks. The +residual block generally contains a shortcut, \ie skip connection, which +effectively supports information propagation in the network. In this paper, we +propose a novel shortcut method, called the gated shortcut, which not only +embraces the strength point of the residual block but also further boosts the +GAN performance. More specifically, based on the gating mechanism, the proposed +method leads the residual block to keep (or remove) information that is +relevant (or irrelevant) to the image being generated. To demonstrate that the +proposed method brings significant improvements in the GAN performance, this +paper provides extensive experimental results on the various standard datasets +such as CIFAR-10, CIFAR-100, LSUN, and tiny-ImageNet. Quantitative evaluations +show that the gated shortcut achieves the impressive GAN performance in terms +of Frechet inception distance (FID) and Inception score (IS). 
For instance, the +proposed method improves the FID and IS scores on the tiny-ImageNet dataset +from 35.13 to 27.90 and 20.23 to 23.42, respectively.",generative adversarial network, +$\mathcal{Y}$-Tuning: An Efficient Tuning Paradigm for Large-Scale Pre-Trained Models via Label Representation Learning,"With the success of large-scale pre-trained models (PTMs), how efficiently +adapting PTMs to downstream tasks has attracted tremendous attention, +especially for PTMs with billions of parameters. Although some +parameter-efficient tuning paradigms have been proposed to address this +problem, they still require large resources to compute the gradients in the +training phase. In this paper, we propose $\mathcal{Y}$-Tuning, an efficient +yet effective paradigm to adapt frozen large-scale PTMs to specific downstream +tasks. $\mathcal{Y}$-tuning learns dense representations for labels +$\mathcal{Y}$ defined in a given task and aligns them to fixed feature +representation. Without tuning the features of input text and model parameters, +$\mathcal{Y}$-tuning is both parameter-efficient and training-efficient. For +$\text{DeBERTa}_\text{XXL}$ with 1.6 billion parameters, $\mathcal{Y}$-tuning +achieves performance more than $96\%$ of full fine-tuning on GLUE Benchmark +with only $2\%$ tunable parameters and much fewer training costs.",parameter-efficient tuning , +Towards Effective Image Manipulation Detection with Proposal Contrastive Learning,"Deep models have been widely and successfully used in image manipulation +detection, which aims to classify tampered images and localize tampered +regions. Most existing methods mainly focus on extracting global features from +tampered images, while neglecting the relationships of local features between +tampered and authentic regions within a single tampered image. To exploit such +spatial relationships, we propose Proposal Contrastive Learning (PCL) for +effective image manipulation detection. 
Our PCL consists of a two-stream +architecture by extracting two types of global features from RGB and noise +views respectively. To further improve the discriminative power, we exploit the +relationships of local features through a proxy proposal contrastive learning +task by attracting/repelling proposal-based positive/negative sample pairs. +Moreover, we show that our PCL can be easily adapted to unlabeled data in +practice, which can reduce manual labeling costs and promote more generalizable +features. Extensive experiments among several standard datasets demonstrate +that our PCL can be a general module to obtain consistent improvement. The code +is available at https://github.com/Sandy-Zeng/PCL.",image manipulation detection, +ReAFFPN: Rotation-equivariant Attention Feature Fusion Pyramid Networks for Aerial Object Detection,"This paper proposes a Rotation-equivariant Attention Feature Fusion Pyramid +Networks for Aerial Object Detection named ReAFFPN. ReAFFPN aims at improving +the effect of rotation-equivariant features fusion between adjacent layers +which suffers from the semantic and scale discontinuity. Due to the +particularity of rotational equivariant convolution, general methods are unable +to achieve their original effect while ensuring rotation equivariance of the +network. To solve this problem, we design a new Rotation-equivariant Channel +Attention which has the ability to both generate channel attention and keep +rotation equivariance. Then we embed a new channel attention function into +Iterative Attentional Feature Fusion (iAFF) module to realize +Rotation-equivariant Attention Feature Fusion. 
Experimental results demonstrate +that ReAFFPN achieves a better rotation-equivariant feature fusion ability and +significantly improve the accuracy of the Rotation-equivariant Convolutional +Networks.",rotation-equivariant feature fusion, +Lexical Complexity Controlled Sentence Generation,"Text generation rarely considers the control of lexical complexity, which +limits its more comprehensive practical application. We introduce a novel task +of lexical complexity controlled sentence generation, which aims at keywords to +sentence generation with desired complexity levels. It has enormous potential +in domains such as grade reading, language teaching and acquisition. The +challenge of this task is to generate fluent sentences only using the words of +given complexity levels. We propose a simple but effective approach for this +task based on complexity embedding. Compared with potential solutions, our +approach fuses the representations of the word complexity levels into the model +to get better control of lexical complexity. And we demonstrate the feasibility +of the approach for both training models from scratch and fine-tuning the +pre-trained models. To facilitate the research, we develop two datasets in +English and Chinese respectively, on which extensive experiments are conducted. +Results show that our approach better controls lexical complexity and generates +higher quality sentences than baseline methods.",Text generation, +Defense against adversarial attacks on deep convolutional neural networks through nonlocal denoising,"Despite substantial advances in network architecture performance, the +susceptibility of adversarial attacks makes deep learning challenging to +implement in safety-critical applications. This paper proposes a data-centric +approach to addressing this problem. 
A nonlocal denoising method with different +luminance values has been used to generate adversarial examples from the +Modified National Institute of Standards and Technology database (MNIST) and +Canadian Institute for Advanced Research (CIFAR-10) data sets. Under +perturbation, the method provided absolute accuracy improvements of up to 9.3% +in the MNIST data set and 13% in the CIFAR-10 data set. Training using +transformed images with higher luminance values increases the robustness of the +classifier. We have shown that transfer learning is disadvantageous for +adversarial machine learning. The results indicate that simple adversarial +examples can improve resilience and make deep learning easier to apply in +various applications.",adversarial attacks Defense, +MaNLP@SMM4H22: BERT for Classification of Twitter Posts,"The reported work is our straightforward approach for the shared task +Classification of tweets self-reporting age organized by the Social Media +Mining for Health Applications (SMM4H) workshop. This literature describes the +approach that was used to build a binary classification system, that classifies +the tweets related to birthday posts into two classes namely, exact +age(positive class) and non-exact age(negative class). We made two submissions +with variations in the preprocessing of text which yielded F1 scores of 0.80 +and 0.81 when evaluated by the organizers.",Twitter post classification, +The Massively Multilingual Natural Language Understanding 2022 (MMNLU-22) Workshop and Competition,"Despite recent progress in Natural Language Understanding (NLU), the creation +of multilingual NLU systems remains a challenge. It is common to have NLU +systems limited to a subset of languages due to lack of available data. They +also often vary widely in performance. We launch a three-phase approach to +address the limitations in NLU and help propel NLU technology to new heights. 
+We release a 52 language dataset called the Multilingual Amazon SLU resource +package (SLURP) for Slot-filling, Intent classification, and Virtual assistant +Evaluation, or MASSIVE, in an effort to address parallel data availability for +voice assistants. We organize the Massively Multilingual NLU 2022 Challenge to +provide a competitive environment and push the state-of-the art in the +transferability of models into other languages. Finally, we host the first +Massively Multilingual NLU workshop which brings these components together. The +MMNLU workshop seeks to advance the science behind multilingual NLU by +providing a platform for the presentation of new research in the field and +connecting teams working on this research direction. This paper summarizes the +dataset, workshop and the competition and the findings of each phase.",multilingual natural language understanding, +Cluster & Tune: Boost Cold Start Performance in Text Classification,"In real-world scenarios, a text classification task often begins with a cold +start, when labeled data is scarce. In such cases, the common practice of +fine-tuning pre-trained models, such as BERT, for a target classification task, +is prone to produce poor performance. We suggest a method to boost the +performance of such models by adding an intermediate unsupervised +classification task, between the pre-training and fine-tuning phases. As such +an intermediate task, we perform clustering and train the pre-trained model on +predicting the cluster labels. We test this hypothesis on various data sets, +and show that this additional classification phase can significantly improve +performance, mainly for topical classification tasks, when the number of +labeled instances available for fine-tuning is only a couple of dozen to a few +hundred.",text classification, +GPTs at Factify 2022: Prompt Aided Fact-Verification,"One of the most pressing societal issues is the fight against false news. 
The +false claims, as difficult as they are to expose, create a lot of damage. To +tackle the problem, fact verification becomes crucial and thus has been a topic +of interest among diverse research communities. Using only the textual form of +data we propose our solution to the problem and achieve competitive results +with other approaches. We present our solution based on two approaches - PLM +(pre-trained language model) based method and Prompt based method. The +PLM-based approach uses the traditional supervised learning, where the model is +trained to take 'x' as input and output prediction 'y' as P(y|x). Whereas, +Prompt-based learning reflects the idea to design input to fit the model such +that the original objective may be re-framed as a problem of (masked) language +modeling. We may further stimulate the rich knowledge provided by PLMs to +better serve downstream tasks by employing extra prompts to fine-tune PLMs. Our +experiments showed that the proposed method performs better than just +fine-tuning PLMs. We achieved an F1 score of 0.6946 on the FACTIFY dataset and +a 7th position on the competition leader-board.",fact verification, +PET: An Annotated Dataset for Process Extraction from Natural Language Text,"Process extraction from text is an important task of process discovery, for +which various approaches have been developed in recent years. However, in +contrast to other information extraction tasks, there is a lack of +gold-standard corpora of business process descriptions that are carefully +annotated with all the entities and relationships of interest. Due to this, it +is currently hard to compare the results obtained by extraction approaches in +an objective manner, whereas the lack of annotated texts also prevents the +application of data-driven information extraction methodologies, typical of the +natural language processing field. 
Therefore, to bridge this gap, we present +the PET dataset, a first corpus of business process descriptions annotated with +activities, gateways, actors, and flow information. We present our new +resource, including a variety of baselines to benchmark the difficulty and +challenges of business process extraction from text. PET can be accessed via +huggingface.co/datasets/patriziobellan/PET",process discovery, +WildQA: In-the-Wild Video Question Answering,"Existing video understanding datasets mostly focus on human interactions, +with little attention being paid to the ""in the wild"" settings, where the +videos are recorded outdoors. We propose WILDQA, a video understanding dataset +of videos recorded in outside settings. In addition to video question answering +(Video QA), we also introduce the new task of identifying visual support for a +given question and answer (Video Evidence Selection). Through evaluations using +a wide range of baseline models, we show that WILDQA poses new challenges to +the vision and language research communities. The dataset is available at +https://lit.eecs.umich.edu/wildqa/.",video question answering, +CodeRetriever: Unimodal and Bimodal Contrastive Learning for Code Search,"In this paper, we propose the CodeRetriever model, which learns the +function-level code semantic representations through large-scale code-text +contrastive pre-training. We adopt two contrastive learning schemes in +CodeRetriever: unimodal contrastive learning and bimodal contrastive learning. +For unimodal contrastive learning, we design an unsupervised learning approach +to build semantic-related code pairs based on the documentation and function +name. For bimodal contrastive learning, we leverage the documentation and +in-line comments of code to build code-text pairs. Both contrastive objectives +can fully leverage large-scale code corpus for pre-training. 
Extensive +experimental results show that CodeRetriever achieves new state-of-the-art with +significant improvement over existing code pre-trained models, on eleven +domain/language-specific code search tasks with six programming languages in +different code granularity (function-level, snippet-level and statement-level). +These results demonstrate the effectiveness and robustness of CodeRetriever.",code pre-trained models, +IIITDWD-ShankarB@ Dravidian-CodeMixi-HASOC2021: mBERT based model for identification of offensive content in south Indian languages,"In recent years, there has been a lot of focus on offensive content. The +amount of offensive content generated by social media is increasing at an +alarming rate. This created a greater need to address this issue than ever +before. To address these issues, the organizers of ""Dravidian-Code Mixed +HASOC-2020"" have created two challenges. Task 1 involves identifying offensive +content in Malayalam data, whereas Task 2 includes Malayalam and Tamil Code +Mixed Sentences. Our team participated in Task 2. In our suggested model, we +experiment with multilingual BERT to extract features, and three different +classifiers are used on extracted features. Our model received a weighted F1 +score of 0.70 for Malayalam data and was ranked fifth; we also received a +weighted F1 score of 0.573 for Tamil Code Mixed data and were ranked eleventh.",offensive content detection, +The KITMUS Test: Evaluating Knowledge Integration from Multiple Sources in Natural Language Understanding Systems,"Many state-of-the-art natural language understanding (NLU) models are based +on pretrained neural language models. These models often make inferences using +information from multiple sources. An important class of such inferences are +those that require both background knowledge, presumably contained in a model's +pretrained parameters, and instance-specific information that is supplied at +inference time. 
However, the integration and reasoning abilities of NLU models +in the presence of multiple knowledge sources have been largely understudied. +In this work, we propose a test suite of coreference resolution subtasks that +require reasoning over multiple facts. These subtasks differ in terms of which +knowledge sources contain the relevant facts. We also introduce subtasks where +knowledge is present only at inference time using fictional knowledge. We +evaluate state-of-the-art coreference resolution models on our dataset. Our +results indicate that several models struggle to reason on-the-fly over +knowledge observed both at pretrain time and at inference time. However, with +task-specific training, a subset of models demonstrates the ability to +integrate certain knowledge types from multiple sources. Still, even the best +performing models seem to have difficulties with reliably integrating knowledge +presented only at inference time.",natural language understanding, +HINT: Hypernetwork Instruction Tuning for Efficient Zero- & Few-Shot Generalisation,"Recent NLP models have shown the remarkable ability to effectively generalise +`zero-shot' to new tasks using only natural language instructions as guidance. +However, many of these approaches suffer from high computational costs due to +their reliance on concatenating lengthy instructions with every input example, +resulting in costly reprocessing of the instruction. To avoid this, we +introduce Hypernetworks for INstruction Tuning (HINT), which convert task +instructions and examples into parameter-efficient modules inserted into an +underlying model using a pretrained text encoder, eliminating the need to +include instructions in the model input. The hypernetwork in HINT also produces +an encoded instruction, which we concatenate with encoded inputs during +decoding to further improve performance. 
HINT models outperform strong +state-of-the-art baselines by over 10% when controlling for compute (measured +in FLOPs). By converting instructions into modules, HINT models can effectively +disregard the length of instructions and few-shot example inputs in terms of +compute usage. As a result, HINT can enhance its performance by up to 25% by +incorporating additional few-shot data, while utilizing only up to 5% more +compute. This combines the strengths of parameter-efficient fine-tuning and +in-context learning.",Few-Shot Generalization, +Many Episode Learning in a Modular Embodied Agent via End-to-End Interaction,"In this work we give a case study of an embodied machine-learning (ML) +powered agent that improves itself via interactions with crowd-workers. The +agent consists of a set of modules, some of which are learned, and others +heuristic. While the agent is not ""end-to-end"" in the ML sense, end-to-end +interaction is a vital part of the agent's learning mechanism. We describe how +the design of the agent works together with the design of multiple annotation +interfaces to allow crowd-workers to assign credit to module errors from +end-to-end interactions, and to label data for individual modules. Over +multiple automated human-agent interaction, credit assignment, data annotation, +and model re-training and re-deployment, rounds we demonstrate agent +improvement.",embodied machine-learning, +MeSHup: A Corpus for Full Text Biomedical Document Indexing,"Medical Subject Heading (MeSH) indexing refers to the problem of assigning a +given biomedical document with the most relevant labels from an extremely large +set of MeSH terms. Currently, the vast number of biomedical articles in the +PubMed database are manually annotated by human curators, which is time +consuming and costly; therefore, a computational system that can assist the +indexing is highly valuable. 
When developing supervised MeSH indexing systems, +the availability of a large-scale annotated text corpus is desirable. A +publicly available, large corpus that permits robust evaluation and comparison +of various systems is important to the research community. We release a large +scale annotated MeSH indexing corpus, MeSHup, which contains 1,342,667 full +text articles in English, together with the associated MeSH labels and +metadata, authors, and publication venues that are collected from the MEDLINE +database. We train an end-to-end model that combines features from documents +and their associated labels on our corpus and report the new baseline.",Medical Subject Heading indexing, +Numerical Optimizations for Weighted Low-rank Estimation on Language Model,"Singular value decomposition (SVD) is one of the most popular compression +methods that approximate a target matrix with smaller matrices. However, +standard SVD treats the parameters within the matrix with equal importance, +which is a simple but unrealistic assumption. The parameters of a trained +neural network model may affect task performance unevenly, which suggests +non-equal importance among the parameters. Compared to SVD, the decomposition +method aware of parameter importance is the more practical choice in real +cases. Unlike standard SVD, weighted value decomposition is a non-convex +optimization problem that lacks a closed-form solution. We systematically +investigated multiple optimization strategies to tackle the problem and +examined our method by compressing Transformer-based language models. Further, +we designed a metric to predict when the SVD may introduce a significant +performance drop, for which our method can be a rescue strategy. 
The extensive +evaluations demonstrate that our method can perform better than current SOTA +methods in compressing Transformer-based language models.",Singular value decomposition, +Labeling instructions matter in biomedical image analysis,"Biomedical image analysis algorithm validation depends on high-quality +annotation of reference datasets, for which labeling instructions are key. +Despite their importance, their optimization remains largely unexplored. Here, +we present the first systematic study of labeling instructions and their impact +on annotation quality in the field. Through comprehensive examination of +professional practice and international competitions registered at the MICCAI +Society, we uncovered a discrepancy between annotators' needs for labeling +instructions and their current quality and availability. Based on an analysis +of 14,040 images annotated by 156 annotators from four professional companies +and 708 Amazon Mechanical Turk (MTurk) crowdworkers using instructions with +different information density levels, we further found that including exemplary +images significantly boosts annotation performance compared to text-only +descriptions, while solely extending text descriptions does not. Finally, +professional annotators constantly outperform MTurk crowdworkers. Our study +raises awareness for the need of quality standards in biomedical image analysis +labeling instructions.",biomedical image analysis, +Neural Graphical Models,"Probabilistic Graphical Models are often used to understand dynamics of a +system. They can model relationships between features (nodes) and the +underlying distribution. Theoretically these models can represent very complex +dependency functions, but in practice often simplifying assumptions are made +due to computational limitations associated with graph operations. 
In this work +we introduce Neural Graphical Models (NGMs) which attempt to represent complex +feature dependencies with reasonable computational costs. Given a graph of +feature relationships and corresponding samples, we capture the dependency +structure between the features along with their complex function +representations by using a neural network as a multi-task learning framework. +We provide efficient learning, inference and sampling algorithms. NGMs can fit +generic graph structures including directed, undirected and mixed-edge graphs +as well as support mixed input data types. We present empirical studies that +show NGMs' capability to represent Gaussian graphical models, perform inference +analysis of a lung cancer data and extract insights from a real world infant +mortality data provided by Centers for Disease Control and Prevention.",Probabilistic Graphical Models, +Self-Supervised Deep Learning to Enhance Breast Cancer Detection on Screening Mammography,"A major limitation in applying deep learning to artificial intelligence (AI) +systems is the scarcity of high-quality curated datasets. We investigate strong +augmentation based self-supervised learning (SSL) techniques to address this +problem. Using breast cancer detection as an example, we first identify a +mammogram-specific transformation paradigm and then systematically compare four +recent SSL methods representing a diversity of approaches. We develop a method +to convert a pretrained model from making predictions on uniformly tiled +patches to whole images, and an attention-based pooling method that improves +the classification performance. We found that the best SSL model substantially +outperformed the baseline supervised model. The best SSL model also improved +the data efficiency of sample labeling by nearly 4-fold and was highly +transferrable from one dataset to another. 
SSL represents a major breakthrough +in computer vision and may help the AI for medical imaging field to shift away +from supervised learning and dependency on scarce labels.",Breast Cancer Detection, +HOP: History-and-Order Aware Pre-training for Vision-and-Language Navigation,"Pre-training has been adopted in a few of recent works for +Vision-and-Language Navigation (VLN). However, previous pre-training methods +for VLN either lack the ability to predict future actions or ignore the +trajectory contexts, which are essential for a greedy navigation process. In +this work, to promote the learning of spatio-temporal visual-textual +correspondence as well as the agent's capability of decision making, we propose +a novel history-and-order aware pre-training paradigm (HOP) with VLN-specific +objectives that exploit the past observations and support future action +prediction. Specifically, in addition to the commonly used Masked Language +Modeling (MLM) and Trajectory-Instruction Matching (TIM), we design two proxy +tasks to model temporal order information: Trajectory Order Modeling (TOM) and +Group Order Modeling (GOM). Moreover, our navigation action prediction is also +enhanced by introducing the task of Action Prediction with History (APH), which +takes into account the history visual perceptions. Extensive experimental +results on four downstream VLN tasks (R2R, REVERIE, NDH, RxR) demonstrate the +effectiveness of our proposed method compared against several state-of-the-art +agents.",vision-and-language navigation, +Adaptive Graph Spatial-Temporal Transformer Network for Traffic Flow Forecasting,"Traffic flow forecasting on graphs has real-world applications in many +fields, such as transportation system and computer networks. Traffic +forecasting can be highly challenging due to complex spatial-temporal +correlations and non-linear traffic patterns. 
Existing works mostly model such +spatial-temporal dependencies by considering spatial correlations and temporal +correlations separately and fail to model the direct spatial-temporal +correlations. Inspired by the recent success of transformers in the graph +domain, in this paper, we propose to directly model the cross-spatial-temporal +correlations on the spatial-temporal graph using local multi-head +self-attentions. To reduce the time complexity, we set the attention receptive +field to the spatially neighboring nodes, and we also introduce an adaptive +graph to capture the hidden spatial-temporal dependencies. Based on these +attention mechanisms, we propose a novel Adaptive Graph Spatial-Temporal +Transformer Network (ASTTN), which stacks multiple spatial-temporal attention +layers to apply self-attention on the input graph, followed by linear layers +for predictions. Experimental results on public traffic network datasets, +METR-LA PEMS-BAY, PeMSD4, and PeMSD7, demonstrate the superior performance of +our model.",traffic flow forecasting, +Relphormer: Relational Graph Transformer for Knowledge Graph Representations,"Transformers have achieved remarkable performance in widespread fields, +including natural language processing, computer vision and graph mining. +However, vanilla Transformer architectures have not yielded promising +improvements in the Knowledge Graph (KG) representations, where the +translational distance paradigm dominates this area. Note that vanilla +Transformer architectures struggle to capture the intrinsically heterogeneous +structural and semantic information of knowledge graphs. To this end, we +propose a new variant of Transformer for knowledge graph representations dubbed +Relphormer. Specifically, we introduce Triple2Seq which can dynamically sample +contextualized sub-graph sequences as the input to alleviate the heterogeneity +issue. 
We propose a novel structure-enhanced self-attention mechanism to encode +the relational information and keep the semantic information within entities +and relations. Moreover, we utilize masked knowledge modeling for general +knowledge graph representation learning, which can be applied to various +KG-based tasks including knowledge graph completion, question answering, and +recommendation. Experimental results on six datasets show that Relphormer can +obtain better performance compared with baselines. Code is available in +https://github.com/zjunlp/Relphormer.",Knowledge Graph representations, +BootAug: Boosting Text Augmentation via Hybrid Instance Filtering Framework,"Text augmentation is an effective technique for addressing the problem of +insufficient data in natural language processing. However, existing text +augmentation methods tend to focus on few-shot scenarios and usually perform +poorly on large public datasets. Our research indicates that existing +augmentation methods often generate instances with shifted feature spaces, +which leads to a drop in performance on the augmented data (for example, EDA +generally loses $\approx 2\%$ in aspect-based sentiment classification). To +address this problem, we propose a hybrid instance-filtering framework +(BootAug) based on pre-trained language models that can maintain a similar +feature space with natural datasets. BootAug is transferable to existing text +augmentation methods (such as synonym substitution and back translation) and +significantly improves the augmentation performance by $\approx 2-3\%$ in +classification accuracy. Our experimental results on three classification tasks +and nine public datasets show that BootAug addresses the performance drop +problem and outperforms state-of-the-art text augmentation methods. 
+Additionally, we release the code to help improve existing augmentation methods +on large datasets.",text augmentation, +ESIE-BERT: Enriching Sub-words Information Explicitly with BERT for Joint Intent Classification and SlotFilling,"Natural language understanding (NLU) has two core tasks: intent +classification and slot filling. The success of pre-training language models +resulted in a significant breakthrough in the two tasks. One of the promising +solutions called BERT can jointly optimize the two tasks. We note that +BERT-based models convert each complex token into multiple sub-tokens by +wordpiece algorithm, which generates a mismatch between the lengths of the +tokens and the labels. This leads to BERT-based models do not do well in label +prediction which limits model performance improvement. Many existing models can +be compatible with this issue but some hidden semantic information is discarded +in the fine-tuning process. We address the problem by introducing a novel joint +method on top of BERT which explicitly models the multiple sub-tokens features +after wordpiece tokenization, thereby contributing to the two tasks. Our method +can well extract the contextual features from complex tokens by the proposed +sub-words attention adapter (SAA), which preserves overall utterance +information. Additionally, we propose an intent attention adapter (IAA) to +obtain the full sentence features to aid users to predict intent. Experimental +results confirm that our proposed model is significantly improved on two public +benchmark datasets. 
In particular, the slot filling F1 score is improved from +96.1 to 98.2 (2.1% absolute) on the Airline Travel Information Systems (ATIS) +dataset.",Natural language understanding, +Bridging the Domain Gap for Multi-Agent Perception,"Existing multi-agent perception algorithms usually select to share deep +neural features extracted from raw sensing data between agents, achieving a +trade-off between accuracy and communication bandwidth limit. However, these +methods assume all agents have identical neural networks, which might not be +practical in the real world. The transmitted features can have a large domain +gap when the models differ, leading to a dramatic performance drop in +multi-agent perception. In this paper, we propose the first lightweight +framework to bridge such domain gaps for multi-agent perception, which can be a +plug-in module for most existing systems while maintaining confidentiality. Our +framework consists of a learnable feature resizer to align features in multiple +dimensions and a sparse cross-domain transformer for domain adaption. Extensive +experiments on the public multi-agent perception dataset V2XSet have +demonstrated that our method can effectively bridge the gap for features from +different domains and outperform other baseline methods significantly by at +least 8% for point-cloud-based 3D object detection.",Multi-Agent Perception, +Towards Adversarially Robust Recommendation from Adaptive Fraudster Detection,"The robustness of recommender systems under node injection attacks has +garnered significant attention. Recently, GraphRfi, a GNN-based recommender +system, was proposed and shown to effectively mitigate the impact of injected +fake users. However, we demonstrate that GraphRfi remains vulnerable to attacks +due to the supervised nature of its fraudster detection component, where +obtaining clean labels is challenging in practice. 
In particular, we propose a +powerful poisoning attack, MetaC, against both GNN-based and MF-based +recommender systems. Furthermore, we analyze why GraphRfi fails under such an +attack. Then, based on our insights obtained from vulnerability analysis, we +design an adaptive fraudster detection module that explicitly considers label +uncertainty. This module can serve as a plug-in for different recommender +systems, resulting in a robust framework named PDR. Comprehensive experiments +show that our defense approach outperforms other benchmark methods under +attacks. Overall, our research presents an effective framework for integrating +fraudster detection into recommendation systems to achieve adversarial +robustness.",recommendation systems, +A Robust Ensemble Model for Patasitic Egg Detection and Classification,"Intestinal parasitic infections, as a leading causes of morbidity worldwide, +still lacks time-saving, high-sensitivity and user-friendly examination method. +The development of deep learning technique reveals its broad application +potential in biological image. In this paper, we apply several object detectors +such as YOLOv5 and variant cascadeRCNNs to automatically discriminate parasitic +eggs in microscope images. Through specially-designed optimization including +raw data augmentation, model ensemble, transfer learning and test time +augmentation, our model achieves excellent performance on challenge dataset. In +addition, our model trained with added noise gains a high robustness against +polluted input, which further broaden its applicability in practice.",parasitic egg detection, +$\beta$-DARTS: Beta-Decay Regularization for Differentiable Architecture Search,"Neural Architecture Search~(NAS) has attracted increasingly more attention in +recent years because of its capability to design deep neural networks +automatically. Among them, differential NAS approaches such as DARTS, have +gained popularity for the search efficiency. 
However, they suffer from two main +issues, the weak robustness to the performance collapse and the poor +generalization ability of the searched architectures. To solve these two +problems, a simple-but-efficient regularization method, termed as Beta-Decay, +is proposed to regularize the DARTS-based NAS searching process. Specifically, +Beta-Decay regularization can impose constraints to keep the value and variance +of activated architecture parameters from too large. Furthermore, we provide +in-depth theoretical analysis on how it works and why it works. Experimental +results on NAS-Bench-201 show that our proposed method can help to stabilize +the searching process and makes the searched network more transferable across +different datasets. In addition, our search scheme shows an outstanding +property of being less dependent on training time and data. Comprehensive +experiments on a variety of search spaces and datasets validate the +effectiveness of the proposed method.",Neural Architecture Search, +Plug & Play Attacks: Towards Robust and Flexible Model Inversion Attacks,"Model inversion attacks (MIAs) aim to create synthetic images that reflect +the class-wise characteristics from a target classifier's private training data +by exploiting the model's learned knowledge. Previous research has developed +generative MIAs that use generative adversarial networks (GANs) as image priors +tailored to a specific target model. This makes the attacks time- and +resource-consuming, inflexible, and susceptible to distributional shifts +between datasets. To overcome these drawbacks, we present Plug & Play Attacks, +which relax the dependency between the target model and image prior, and enable +the use of a single GAN to attack a wide range of targets, requiring only minor +adjustments to the attack. 
Moreover, we show that powerful MIAs are possible +even with publicly available pre-trained GANs and under strong distributional +shifts, for which previous approaches fail to produce meaningful results. Our +extensive evaluation confirms the improved robustness and flexibility of Plug & +Play Attacks and their ability to create high-quality images revealing +sensitive class characteristics.",model inversion attacks, +Person Re-Identification,"Person Re-Identification (Re-ID) is an important problem in computer +vision-based surveillance applications, in which one aims to identify a person +across different surveillance photographs taken from different cameras having +varying orientations and field of views. Due to the increasing demand for +intelligent video surveillance, Re-ID has gained significant interest in the +computer vision community. In this work, we experiment on some existing Re-ID +methods that obtain state of the art performance in some open benchmarks. We +qualitatively and quantitaively analyse their performance on a provided +dataset, and then propose methods to improve the results. This work was the +report submitted for COL780 final project at IIT Delhi.",person re-identification, +DP$^2$-NILM: A Distributed and Privacy-preserving Framework for Non-intrusive Load Monitoring,"Non-intrusive load monitoring (NILM), which usually utilizes machine learning +methods and is effective in disaggregating smart meter readings from the +household-level into appliance-level consumption, can help analyze electricity +consumption behaviours of users and enable practical smart energy and smart +grid applications. Recent studies have proposed many novel NILM frameworks +based on federated deep learning (FL). However, there lacks comprehensive +research exploring the utility optimization schemes and the privacy-preserving +schemes in different FL-based NILM application scenarios. 
In this paper, we +make the first attempt to conduct FL-based NILM focusing on both the utility +optimization and the privacy-preserving by developing a distributed and +privacy-preserving NILM (DP2-NILM) framework and carrying out comparative +experiments on practical NILM scenarios based on real-world smart meter +datasets. Specifically, two alternative federated learning strategies are +examined in the utility optimization schemes, i.e., the FedAvg and the FedProx. +Moreover, different levels of privacy guarantees, i.e., the local differential +privacy federated learning and the global differential privacy federated +learning are provided in the DP2-NILM. Extensive comparison experiments are +conducted on three real-world datasets to evaluate the proposed framework.",Non-intrusive load monitoring, +Persian Abstract Meaning Representation,"Abstract Meaning Representation (AMR) is an annotation framework representing +the semantic structure of a sentence as a whole. From the beginning, AMR was +not intended to act as an interlingua; however, it has made progress towards +the idea of designing a universal meaning representation framework. +Accordingly, developing AMR annotation guidelines for different languages, +based on language divergences, is of significant importance. In this paper, we +elaborate on Persian Abstract Meaning Representation (PAMR) annotation +specifications, based on which we annotated the Persian translation of ""The +Little Prince"" as the first gold standard for Persian AMR. Moreover, we +describe how some Persian-specific syntactic constructions would result in +different AMR annotations.",Abstract Meaning Representation, +Scaling Knowledge Graph Embedding Models,"Developing scalable solutions for training Graph Neural Networks (GNNs) for +link prediction tasks is challenging due to the high data dependencies which +entail high computational cost and huge memory footprint. 
We propose a new +method for scaling training of knowledge graph embedding models for link +prediction to address these challenges. Towards this end, we propose the +following algorithmic strategies: self-sufficient partitions, constraint-based +negative sampling, and edge mini-batch training. Both, partitioning strategy +and constraint-based negative sampling, avoid cross partition data transfer +during training. In our experimental evaluation, we show that our scaling +solution for GNN-based knowledge graph embedding models achieves a 16x speed up +on benchmark datasets while maintaining a comparable model performance as +non-distributed methods on standard metrics.",link prediction, +Chance-Constrained Optimization in Contact-Rich Systems for Robust Manipulation,"This paper presents a chance-constrained formulation for robust trajectory +optimization during manipulation. In particular, we present a +chance-constrained optimization for Stochastic Discrete-time Linear +Complementarity Systems (SDLCS). To solve the optimization problem, we +formulate Mixed-Integer Quadratic Programming with Chance Constraints (MIQPCC). +In our formulation, we explicitly consider joint chance constraints for +complementarity as well as states to capture the stochastic evolution of +dynamics. We evaluate robustness of our optimized trajectories in simulation on +several systems. The proposed approach outperforms some recent approaches for +robust trajectory optimization for SDLCS.",Trajectory Optimization, +Spherical Transformer,"Using convolutional neural networks for 360images can induce sub-optimal +performance due to distortions entailed by a planar projection. The distortion +gets deteriorated when a rotation is applied to the 360image. Thus, many +researches based on convolutions attempt to reduce the distortions to learn +accurate representation. In contrast, we leverage the transformer architecture +to solve image classification problems for 360images. 
Using the proposed +transformer for 360images has two advantages. First, our method does not +require the erroneous planar projection process by sampling pixels from the +sphere surface. Second, our sampling method based on regular polyhedrons makes +low rotation equivariance errors, because specific rotations can be reduced to +permutations of faces. In experiments, we validate our network on two aspects, +as follows. First, we show that using a transformer with highly uniform +sampling methods can help reduce the distortion. Second, we demonstrate that +the transformer architecture can achieve rotation equivariance on specific +rotations. We compare our method to other state-of-the-art algorithms using the +SPH-MNIST, SPH-CIFAR, and SUN360 datasets and show that our method is +competitive with other methods.",Spherical Image Classification, +Investigation of a Machine learning methodology for the SKA pulsar search pipeline,"The SKA pulsar search pipeline will be used for real time detection of +pulsars. Modern radio telescopes such as SKA will be generating petabytes of +data in their full scale of operation. Hence experience-based and data-driven +algorithms become indispensable for applications such as candidate detection. +Here we describe our findings from testing a state of the art object detection +algorithm called Mask R-CNN to detect candidate signatures in the SKA pulsar +search pipeline. We have trained the Mask R-CNN model to detect candidate +images. A custom annotation tool was developed to mark the regions of interest +in large datasets efficiently. We have successfully demonstrated this algorithm +by detecting candidate signatures on a simulation dataset. 
The paper presents +details of this work with a highlight on the future prospects.",pulsars detection, +Doubly Reparameterized Importance Weighted Structure Learning for Scene Graph Generation,"As a structured prediction task, scene graph generation, given an input +image, aims to explicitly model objects and their relationships by constructing +a visually-grounded scene graph. In the current literature, such task is +universally solved via a message passing neural network based mean field +variational Bayesian methodology. The classical loose evidence lower bound is +generally chosen as the variational inference objective, which could induce +oversimplified variational approximation and thus underestimate the underlying +complex posterior. In this paper, we propose a novel doubly reparameterized +importance weighted structure learning method, which employs a tighter +importance weighted lower bound as the variational inference objective. It is +computed from multiple samples drawn from a reparameterizable Gumbel-Softmax +sampler and the resulting constrained variational inference task is solved by a +generic entropic mirror descent algorithm. The resulting doubly reparameterized +gradient estimator reduces the variance of the corresponding derivatives with a +beneficial impact on learning. The proposed method achieves the +state-of-the-art performance on various popular scene graph generation +benchmarks.",scene graph generation, +2-d signature of images and texture classification,"We introduce a proper notion of 2-dimensional signature for images. This +object is inspired by the so-called rough paths theory, and it captures many +essential features of a 2-dimensional object such as an image. It thus serves +as a low-dimensional feature for pattern classification. Here we implement a +simple procedure for texture classification. 
In this context, we show that a +low dimensional set of features based on signatures produces an excellent +accuracy.",texture classification, +"Artificial Intelligence for Health Message Generation: Theory, Method, and an Empirical Study Using Prompt Engineering","This study introduces and examines the potential of an AI system to generate +health awareness messages. The topic of folic acid, a vitamin that is critical +during pregnancy, served as a test case. Using prompt engineering, we generated +messages that could be used to raise awareness and compared them to retweeted +human-generated messages via computational and human evaluation methods. The +system was easy to use and prolific, and computational analyses revealed that +the AI-generated messages were on par with human-generated ones in terms of +sentiment, reading ease, and semantic content. Also, the human evaluation study +showed that AI-generated messages ranked higher in message quality and clarity. +We discuss the theoretical, practical, and ethical implications of these +results.",health message generation, +Face Presentation Attack Detection,"Face recognition technology has been widely used in daily interactive +applications such as checking-in and mobile payment due to its convenience and +high accuracy. However, its vulnerability to presentation attacks (PAs) limits +its reliable use in ultra-secure applicational scenarios. A presentation attack +is first defined in ISO standard as: a presentation to the biometric data +capture subsystem with the goal of interfering with the operation of the +biometric system. Specifically, PAs range from simple 2D print, replay and more +sophisticated 3D masks and partial masks. 
To defend the face recognition +systems against PAs, both academia and industry have paid extensive attention +to developing face presentation attack detection (PAD) technology (or namely +`face anti-spoofing (FAS)').",face presentation attack detection, +Template-based Recruitment Email Generation For Job Recommendation,"Text generation has long been a popular research topic in NLP. However, the +task of generating recruitment emails from recruiters to candidates in the job +recommendation scenario has received little attention by the research +community. This work aims at defining the topic of automatic email generation +for job recommendation, identifying the challenges, and providing a baseline +template-based solution for Danish jobs. Evaluation by human experts shows that +our method is effective. We wrap up by discussing the future research +directions for better solving this task.",Text generation, +Divide & Conquer Imitation Learning,"When cast into the Deep Reinforcement Learning framework, many robotics tasks +require solving a long horizon and sparse reward problem, where learning +algorithms struggle. In such context, Imitation Learning (IL) can be a powerful +approach to bootstrap the learning process. However, most IL methods require +several expert demonstrations which can be prohibitively difficult to acquire. +Only a handful of IL algorithms have shown efficiency in the context of an +extreme low expert data regime where a single expert demonstration is +available. In this paper, we present a novel algorithm designed to imitate +complex robotic tasks from the states of an expert trajectory. Based on a +sequential inductive bias, our method divides the complex task into smaller +skills. The skills are learned into a goal-conditioned policy that is able to +solve each skill individually and chain skills to solve the entire task. 
We +show that our method imitates a non-holonomic navigation task and scales to a +complex simulated robotic manipulation task with very high sample efficiency.",imitation learning, +GANimator: Neural Motion Synthesis from a Single Sequence,"We present GANimator, a generative model that learns to synthesize novel +motions from a single, short motion sequence. GANimator generates motions that +resemble the core elements of the original motion, while simultaneously +synthesizing novel and diverse movements. Existing data-driven techniques for +motion synthesis require a large motion dataset which contains the desired and +specific skeletal structure. By contrast, GANimator only requires training on a +single motion sequence, enabling novel motion synthesis for a variety of +skeletal structures e.g., bipeds, quadrupeds, hexapods, and more. Our framework +contains a series of generative and adversarial neural networks, each +responsible for generating motions in a specific frame rate. The framework +progressively learns to synthesize motion from random noise, enabling +hierarchical control over the generated motion content across varying levels of +detail. We show a number of applications, including crowd simulation, key-frame +editing, style transfer, and interactive control, which all learn from a single +input sequence. Code and data for this paper are at +https://peizhuoli.github.io/ganimator.",motion synthesis, +Learning shape distributions from large databases of healthy organs: applications to zero-shot and few-shot abnormal pancreas detection,"We propose a scalable and data-driven approach to learn shape distributions +from large databases of healthy organs. To do so, volumetric segmentation masks +are embedded into a common probabilistic shape space that is learned with a +variational auto-encoding network. The resulting latent shape representations +are leveraged to derive zero-shot and few-shot methods for abnormal shape +detection. 
The proposed distribution learning approach is illustrated on a +large database of 1200 healthy pancreas shapes. Downstream qualitative and +quantitative experiments are conducted on a separate test set of 224 pancreas +from patients with mixed conditions. The abnormal pancreas detection AUC +reached up to 65.41% in the zero-shot configuration, and 78.97% in the few-shot +configuration with as few as 15 abnormal examples, outperforming a baseline +approach based on the sole volume.",abnormal pancreas detection, +Low-light Enhancement Method Based on Attention Map Net,"Low-light image enhancement is a crucial preprocessing task for some complex +vision tasks. Target detection, image segmentation, and image recognition +outcomes are all directly impacted by the impact of image enhancement. However, +the majority of the currently used image enhancement techniques do not produce +satisfactory outcomes, and these enhanced networks have relatively weak +robustness. We suggest an improved network called BrightenNet that uses U-Net +as its primary structure and incorporates a number of different attention +mechanisms as a solution to this issue. In a specific application, we employ +the network as the generator and LSGAN as the training framework to achieve +better enhancement results. We demonstrate the validity of the proposed network +BrightenNet in the experiments that follow in this paper. The results it +produced can both preserve image details and conform to human vision standards.",low-light image enhancement, +PERFECT: Prompt-free and Efficient Few-shot Learning with Language Models,"Current methods for few-shot fine-tuning of pretrained masked language models +(PLMs) require carefully engineered prompts and verbalizers for each new task +to convert examples into a cloze-format that the PLM can score. 
In this work, +we propose PERFECT, a simple and efficient method for few-shot fine-tuning of +PLMs without relying on any such handcrafting, which is highly effective given +as few as 32 data points. PERFECT makes two key design choices: First, we show +that manually engineered task prompts can be replaced with task-specific +adapters that enable sample-efficient fine-tuning and reduce memory and storage +costs by roughly factors of 5 and 100, respectively. Second, instead of using +handcrafted verbalizers, we learn new multi-token label embeddings during +fine-tuning, which are not tied to the model vocabulary and which allow us to +avoid complex auto-regressive decoding. These embeddings are not only learnable +from limited data but also enable nearly 100x faster training and inference. +Experiments on a wide range of few-shot NLP tasks demonstrate that PERFECT, +while being simple and efficient, also outperforms existing state-of-the-art +few-shot learning methods. Our code is publicly available at +https://github.com/facebookresearch/perfect.git.",few-shot learning, +LiSnowNet: Real-time Snow Removal for LiDAR Point Cloud,"LiDARs have been widely adopted to modern self-driving vehicles, providing 3D +information of the scene and surrounding objects. However, adverse weather +conditions still pose significant challenges to LiDARs since point clouds +captured during snowfall can easily be corrupted. The resulting noisy point +clouds degrade downstream tasks such as mapping. Existing works in de-noising +point clouds corrupted by snow are based on nearest-neighbor search, and thus +do not scale well with modern LiDARs which usually capture $100k$ or more +points at 10Hz. In this paper, we introduce an unsupervised de-noising +algorithm, LiSnowNet, running 52$\times$ faster than the state-of-the-art +methods while achieving superior performance in de-noising. 
Unlike previous +methods, the proposed algorithm is based on a deep convolutional neural network +and can be easily deployed to hardware accelerators such as GPUs. In addition, +we demonstrate how to use the proposed method for mapping even with corrupted +point clouds.", LiDAR Point Cloud, +Repairing $\mathcal{EL}$ Ontologies Using Weakening and Completing,"The quality of ontologies in terms of their correctness and completeness is +crucial for developing high-quality ontology-based applications. Traditional +debugging techniques repair ontologies by removing unwanted axioms, but may +thereby remove consequences that are correct in the domain of the ontology. In +this paper we propose an interactive approach to mitigate this for +$\mathcal{EL}$ ontologies by axiom weakening and completing. We present +algorithms for weakening and completing and present the first approach for +repairing that takes into account removing, weakening and completing. We show +different combination strategies, discuss the influence on the final ontologies +and show experimental results. We show that previous work has only considered +special cases and that there is a trade-off between the amount of validation +work for a domain expert and the quality of the ontology in terms of +correctness and completeness.",ontology repair, +Continual Learning For On-Device Environmental Sound Classification,"Continuously learning new classes without catastrophic forgetting is a +challenging problem for on-device environmental sound classification given the +restrictions on computation resources (e.g., model size, running memory). To +address this issue, we propose a simple and efficient continual learning +method. Our method selects the historical data for the training by measuring +the per-sample classification uncertainty. Specifically, we measure the +uncertainty by observing how the classification probability of data fluctuates +against the parallel perturbations added to the classifier embedding. 
In this +way, the computation cost can be significantly reduced compared with adding +perturbation to the raw data. Experimental results on the DCASE 2019 Task 1 and +ESC-50 dataset show that our proposed method outperforms baseline continual +learning methods on classification accuracy and computational efficiency, +indicating our method can efficiently and incrementally learn new classes +without the catastrophic forgetting problem for on-device environmental sound +classification.",environmental sound classification, +Differentiable SAR Renderer and SAR Target Reconstruction,"Forward modeling of wave scattering and radar imaging mechanisms is the key +to information extraction from synthetic aperture radar (SAR) images. Like +inverse graphics in optical domain, an inherently-integrated forward-inverse +approach would be promising for SAR advanced information retrieval and target +reconstruction. This paper presents such an attempt to the inverse graphics for +SAR imagery. A differentiable SAR renderer (DSR) is developed which +reformulates the mapping and projection algorithm of SAR imaging mechanism in +the differentiable form of probability maps. First-order gradients of the +proposed DSR are then analytically derived which can be back-propagated from +rendered image/silhouette to the target geometry and scattering attributes. A +3D inverse target reconstruction algorithm from SAR images is devised. Several +simulation and reconstruction experiments are conducted, including targets with +and without background, using both synthesized data or real measured inverse +SAR (ISAR) data by ground radar. Results demonstrate the efficacy of the +proposed DSR and its inverse approach.",differentiable SAR renderer, +DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine,"In this paper, we present DuReader_retrieval, a large-scale Chinese dataset +for passage retrieval. 
DuReader_retrieval contains more than 90K queries and +over 8M unique passages from a commercial search engine. To alleviate the +shortcomings of other datasets and ensure the quality of our benchmark, we (1) +reduce the false negatives in development and test sets by manually annotating +results pooled from multiple retrievers, and (2) remove the training queries +that are semantically similar to the development and testing queries. +Additionally, we provide two out-of-domain testing sets for cross-domain +evaluation, as well as a set of human translated queries for cross-lingual +retrieval evaluation. The experiments demonstrate that DuReader_retrieval is +challenging and a number of problems remain unsolved, such as the salient +phrase mismatch and the syntactic mismatch between queries and paragraphs. +These experiments also show that dense retrievers do not generalize well across +domains, and cross-lingual retrieval is essentially challenging. +DuReader_retrieval is publicly available at +https://github.com/baidu/DuReader/tree/master/DuReader-Retrieval.",passage retrieval, +ALERT: Adapting Language Models to Reasoning Tasks,"Current large language models can perform reasonably well on complex tasks +that require step-by-step reasoning with few-shot learning. Are these models +applying reasoning skills they have learnt during pre-training and reason +outside of their training context, or are they simply memorizing their training +corpus at finer granularity and have learnt to better understand their context? +To tease apart these possibilities, we introduce ALERT, a benchmark and suite +of analyses for assessing language models' reasoning ability comparing +pre-trained and finetuned models on complex tasks that require reasoning skills +to solve. ALERT provides a test bed to assess any language model on fine-grained +reasoning skills, which spans over 20 datasets and covers 10 different +reasoning skills. 
We leverage ALERT to further investigate the role of +finetuning. With extensive empirical analysis we find that language models +learn more reasoning skills such as textual entailment, abductive reasoning, +and analogical reasoning during finetuning stage compared to pretraining state. +We also find that when language models are finetuned they tend to overfit to +the prompt template, which hurts the robustness of models causing +generalization problems.",language model reasoning, +Logic-Based Ethical Planning,"In this paper we propose a framework for ethical decision making in the +context of planning, with intended application to robotics. We put forward a +compact but highly expressive language for ethical planning that combines +linear temporal logic with lexicographic preference modelling. This original +combination allows us to assess plans both with respect to an agent's values +and their desires, introducing the novel concept of the morality level of an +agent and moving towards multigoal, multivalue planning. We initiate the study +of computational complexity of planning tasks in our setting, and we discuss +potential applications to robotics.",ethical planning, +From Cloze to Comprehension: Retrofitting Pre-trained Masked Language Model to Pre-trained Machine Reader,"We present Pre-trained Machine Reader (PMR), a novel method for retrofitting +pre-trained masked language models (MLMs) to pre-trained machine reading +comprehension (MRC) models without acquiring labeled data. PMR can resolve the +discrepancy between model pre-training and downstream fine-tuning of existing +MLMs. To build the proposed PMR, we constructed a large volume of +general-purpose and high-quality MRC-style training data by using Wikipedia +hyperlinks and designed a Wiki Anchor Extraction task to guide the MRC-style +pre-training. Apart from its simplicity, PMR effectively solves extraction +tasks, such as Extractive Question Answering and Named Entity Recognition. 
PMR +shows tremendous improvements over existing approaches, especially in +low-resource scenarios. When applied to the sequence classification task in the +MRC formulation, PMR enables the extraction of high-quality rationales to +explain the classification process, thereby providing greater prediction +explainability. PMR also has the potential to serve as a unified model for +tackling various extraction and classification tasks in the MRC formulation.",machine reading comprehension, +A Benchmark of Long-tailed Instance Segmentation with Noisy Labels,"In this paper, we consider the instance segmentation task on a long-tailed +dataset, which contains label noise, i.e., some of the annotations are +incorrect. There are two main reasons making this case realistic. First, +datasets collected from real world usually obey a long-tailed distribution. +Second, for instance segmentation datasets, as there are many instances in one +image and some of them are tiny, it is easier to introduce noise into the +annotations. Specifically, we propose a new dataset, which is a large +vocabulary long-tailed dataset containing label noise for instance +segmentation. Furthermore, we evaluate previous proposed instance segmentation +algorithms on this dataset. The results indicate that the noise in the training +dataset will hamper the model in learning rare categories and decrease the +overall performance, and inspire us to explore more effective approaches to +address this practical challenge. The code and dataset are available in +https://github.com/GuanlinLee/Noisy-LVIS.",long-tailed instance segmentation, +Findings of the Shared Task on Offensive Span Identification from Code-Mixed Tamil-English Comments,"Offensive content moderation is vital in social media platforms to support +healthy online discussions. However, their prevalence in codemixed Dravidian +languages is limited to classifying whole comments without identifying part of +it contributing to offensiveness. 
Such limitation is primarily due to the lack +of annotated data for offensive spans. Accordingly, in this shared task, we +provide Tamil-English code-mixed social comments with offensive spans. This +paper outlines the dataset so released, methods, and results of the submitted +systems.",Offensive Content Span Detection, +Correlation between Alignment-Uniformity and Performance of Dense Contrastive Representations,"Recently, dense contrastive learning has shown superior performance on dense +prediction tasks compared to instance-level contrastive learning. Despite its +supremacy, the properties of dense contrastive representations have not yet +been carefully studied. Therefore, we analyze the theoretical ideas of dense +contrastive learning using a standard CNN and straightforward feature matching +scheme rather than propose a new complex method. Inspired by the analysis of +the properties of instance-level contrastive representations through the lens +of alignment and uniformity on the hypersphere, we employ and extend the same +lens for the dense contrastive representations to analyze their underexplored +properties. We discover the core principle in constructing a positive pair of +dense features and empirically proved its validity. Also, we introduce a new +scalar metric that summarizes the correlation between alignment-and-uniformity +and downstream performance. Using this metric, we study various facets of +densely learned contrastive representations such as how the correlation changes +over single- and multi-object datasets or linear evaluation and dense +prediction tasks. The source code is publicly available at: +https://github.com/SuperSupermoon/DenseCL-analysis",dense contrastive learning, +KESA: A Knowledge Enhanced Approach For Sentiment Analysis,"Though some recent works focus on injecting sentiment knowledge into +pre-trained language models, they usually design mask and reconstruction tasks +in the post-training phase. 
In this paper, we aim to benefit from sentiment +knowledge in a lighter way. To achieve this goal, we study sentence-level +sentiment analysis and, correspondingly, propose two sentiment-aware auxiliary +tasks named sentiment word cloze and conditional sentiment prediction. The +first task learns to select the correct sentiment words within the input, given +the overall sentiment polarity as prior knowledge. On the contrary, the second +task predicts the overall sentiment polarity given the sentiment polarity of +the word as prior knowledge. In addition, two kinds of label combination +methods are investigated to unify multiple types of labels in each task. We +argue that more information can promote the models to learn more profound +semantic representation. We implement it in a straightforward way to verify +this hypothesis. The experimental results demonstrate that our approach +consistently outperforms pre-trained models and is additive to existing +knowledge-enhanced post-trained models. The code and data are released at +https://github.com/lshowway/KESA.",Sentence-level Sentiment Analysis, +C2-CRS: Coarse-to-Fine Contrastive Learning for Conversational Recommender System,"Conversational recommender systems (CRS) aim to recommend suitable items to +users through natural language conversations. For developing effective CRSs, a +major technical issue is how to accurately infer user preference from very +limited conversation context. To address this issue, a promising solution is to +incorporate external data for enriching the context information. However, prior +studies mainly focus on designing fusion models tailored for some specific type +of external data, which is not general to model and utilize multi-type external +data. + To effectively leverage multi-type external data, we propose a novel +coarse-to-fine contrastive learning framework to improve data semantic fusion +for CRS. 
In our approach, we first extract and represent multi-grained semantic +units from different data signals, and then align the associated multi-type +semantic units in a coarse-to-fine way. To implement this framework, we design +both coarse-grained and fine-grained procedures for modeling user preference, +where the former focuses on more general, coarse-grained semantic fusion and +the latter focuses on more specific, fine-grained semantic fusion. Such an +approach can be extended to incorporate more kinds of external data. Extensive +experiments on two public CRS datasets have demonstrated the effectiveness of +our approach in both recommendation and conversation tasks.",Conversational Recommender Systems, +How Effective is Incongruity? Implications for Code-mix Sarcasm Detection,"The presence of sarcasm in conversational systems and social media like +chatbots, Facebook, Twitter, etc. poses several challenges for downstream NLP +tasks. This is attributed to the fact that the intended meaning of a sarcastic +text is contrary to what is expressed. Further, the use of code-mix language to +express sarcasm is increasing day by day. Current NLP techniques for code-mix +data have limited success due to the use of different lexicon, syntax, and +scarcity of labeled corpora. To solve the joint problem of code-mixing and +sarcasm detection, we propose the idea of capturing incongruity through +sub-word level embeddings learned via fastText. Empirical results show that +our proposed model achieves F1-score on code-mix Hinglish dataset comparable to +pretrained multilingual models while training 10x faster and using a lower +memory footprint.",Sarcasm Detection, +EVOTER: Evolution of Transparent Explainable Rule-sets,"Most AI systems are black boxes generating reasonable outputs for given +inputs. Some domains, however, have explainability and trustworthiness +requirements that cannot be directly met by these approaches. 
Various methods +have therefore been developed to interpret black-box models after training. +This paper advocates an alternative approach where the models are transparent +and explainable to begin with. This approach, EVOTER, evolves rule-sets based +on simple logical expressions. The approach is evaluated in several +prediction/classification and prescription/policy search domains with and +without a surrogate. It is shown to discover meaningful rule sets that perform +similarly to black-box models. The rules can provide insight into the domain, +and make biases hidden in the data explicit. It may also be possible to edit +them directly to remove biases and add constraints. EVOTER thus forms a +promising foundation for building trustworthy AI systems for real-world +applications in the future.",Transparent and Explainable Models, +Lightweight Monocular Depth Estimation,"Monocular depth estimation can play an important role in addressing the issue +of deriving scene geometry from 2D images. It has been used in a variety of +industries, including robots, self-driving cars, scene comprehension, 3D +reconstructions, and others. The goal of our method is to create a lightweight +machine-learning model in order to predict the depth value of each pixel given +only a single RGB image as input with the Unet structure of the image +segmentation network. We use the NYU Depth V2 dataset to test the structure and +compare the result with other methods. The proposed method achieves relatively +high accuracy and low root-mean-square error.",monocular depth estimation, +Look\&Listen: Multi-Modal Correlation Learning for Active Speaker Detection and Speech Enhancement,"Active speaker detection and speech enhancement have become two increasingly +attractive topics in audio-visual scenario understanding. According to their +respective characteristics, the scheme of independently designed architecture +has been widely used in correspondence to each single task. 
This may lead to +the representation learned by the model being task-specific, and inevitably +result in the lack of generalization ability of the feature based on +multi-modal modeling. More recent studies have shown that establishing +cross-modal relationship between auditory and visual stream is a promising +solution for the challenge of audio-visual multi-task learning. Therefore, as a +motivation to bridge the multi-modal associations in audio-visual tasks, a +unified framework is proposed to achieve target speaker detection and speech +enhancement with joint learning of audio-visual modeling in this study.",audio-visual multi-task learning, +Pred&Guide: Labeled Target Class Prediction for Guiding Semi-Supervised Domain Adaptation,"Semi-supervised domain adaptation aims to classify data belonging to a target +domain by utilizing a related label-rich source domain and very few labeled +examples of the target domain. Here, we propose a novel framework, Pred&Guide, +which leverages the inconsistency between the predicted and the actual class +labels of the few labeled target examples to effectively guide the domain +adaptation in a semi-supervised setting. Pred&Guide consists of three stages, +as follows (1) First, in order to treat all the target samples equally, we +perform unsupervised domain adaptation coupled with self-training; (2) Second +is the label prediction stage, where the current model is used to predict the +labels of the few labeled target examples, and (3) Finally, the correctness of +the label predictions are used to effectively weigh source examples class-wise +to better guide the domain adaptation process. 
Extensive experiments show that +the proposed Pred&Guide framework achieves state-of-the-art results for two +large-scale benchmark datasets, namely Office-Home and DomainNet.",semi-supervised domain adaptation, +Selecting Seed Words for Wordle using Character Statistics,"Wordle, a word guessing game rose to global popularity in the January of +2022. The goal of the game is to guess a five-letter English word within six +tries. Each try provides the player with hints by means of colour changing +tiles which inform whether or not a given character is part of the solution as +well as, in cases where it is part of the solution, whether or not it is in the +correct placement. Numerous attempts have been made to find the best starting +word and best strategy to solve the daily wordle. This study uses character +statistics of five-letter words to determine the best three starting words.",Word Game Strategy Optimization, +Feasibility on Detecting Door Slamming towards Monitoring Early Signs of Domestic Violence,"By using low-cost microcontrollers and TinyML, we investigate the feasibility +of detecting potential early warning signs of domestic violence and other +anti-social behaviors within the home. We created a machine learning model to +determine if a door was closed aggressively by analyzing audio data and feeding +this into a convolutional neural network to classify the sample. Under test +conditions, with no background noise, accuracy of 88.89\% was achieved, +declining to 87.50\% when assorted background noises were mixed in at a +relative volume of 0.5 times that of the sample. The model is then deployed on +an Arduino Nano BLE 33 Sense attached to the door, and only begins sampling +once an acceleration greater than a predefined threshold acceleration is +detected. 
The predictions made by the model can then be sent via BLE to another +device, such as a smartphone or Raspberry Pi.",domestic violence detection, +Escaping Data Scarcity for High-Resolution Heterogeneous Face Hallucination,"In Heterogeneous Face Recognition (HFR), the objective is to match faces +across two different domains such as visible and thermal. Large domain +discrepancy makes HFR a difficult problem. Recent methods attempting to fill +the gap via synthesis have achieved promising results, but their performance is +still limited by the scarcity of paired training data. In practice, large-scale +heterogeneous face data are often inaccessible due to the high cost of +acquisition and annotation process as well as privacy regulations. In this +paper, we propose a new face hallucination paradigm for HFR, which not only +enables data-efficient synthesis but also allows to scale up model training +without breaking any privacy policy. Unlike existing methods that learn face +synthesis entirely from scratch, our approach is particularly designed to take +advantage of rich and diverse facial priors from visible domain for more +faithful hallucination. On the other hand, large-scale training is enabled by +introducing a new federated learning scheme to allow institution-wise +collaborations while avoiding explicit data sharing. Extensive experiments +demonstrate the advantages of our approach in tackling HFR under current data +limitations. In a unified framework, our method yields the state-of-the-art +hallucination results on multiple HFR datasets.",heterogeneous face hallucination, +muBoost: An Effective Method for Solving Indic Multilingual Text Classification Problem,"Text Classification is an integral part of many Natural Language Processing +tasks such as sarcasm detection, sentiment analysis and many more such +applications.
Many e-commerce websites, social-media/entertainment platforms +use such models to enhance user-experience to generate traffic and thus, +revenue on their platforms. In this paper, we are presenting our solution to +Multilingual Abusive Comment Identification Problem on Moj, an Indian +video-sharing social networking service, powered by ShareChat. The problem +dealt with detecting abusive comments, in 13 regional Indic languages such as +Hindi, Telugu, Kannada etc., on the videos on Moj platform. Our solution +utilizes the novel muBoost, an ensemble of CatBoost classifier models and +Multilingual Representations for Indian Languages (MURIL) model, to produce +SOTA performance on Indic text classification tasks. We were able to achieve a +mean F1-score of 89.286 on the test data, an improvement over baseline MURIL +model with a F1-score of 87.48.",multilingual text classification, +PrivMVMF: Privacy-Preserving Multi-View Matrix Factorization for Recommender Systems,"With an increasing focus on data privacy, there have been pilot studies on +recommender systems in a federated learning (FL) framework, where multiple +parties collaboratively train a model without sharing their data. Most of these +studies assume that the conventional FL framework can fully protect user +privacy. However, there are serious privacy risks in matrix factorization in +federated recommender systems based on our study. This paper first provides a +rigorous theoretical analysis of the server reconstruction attack in four +scenarios in federated recommender systems, followed by comprehensive +experiments. The empirical results demonstrate that the FL server could infer +users' information with accuracy >80% based on the uploaded gradients from FL +nodes. The robustness analysis suggests that our reconstruction attack analysis +outperforms the random guess by >30% under Laplace noises with b no larger than +0.5 for all scenarios. 
Then, the paper proposes a new privacy-preserving +framework based on homomorphic encryption, Privacy-Preserving Multi-View Matrix +Factorization (PrivMVMF), to enhance user data privacy protection in federated +recommender systems. The proposed PrivMVMF is successfully implemented and +tested thoroughly with the MovieLens dataset.",Privacy-Preserving Recommender Systems, +MulT: An End-to-End Multitask Learning Transformer,"We propose an end-to-end Multitask Learning Transformer framework, named +MulT, to simultaneously learn multiple high-level vision tasks, including depth +estimation, semantic segmentation, reshading, surface normal estimation, 2D +keypoint detection, and edge detection. Based on the Swin transformer model, +our framework encodes the input image into a shared representation and makes +predictions for each vision task using task-specific transformer-based decoder +heads. At the heart of our approach is a shared attention mechanism modeling +the dependencies across the tasks. We evaluate our model on several multitask +benchmarks, showing that our MulT framework outperforms both the state-of-the +art multitask convolutional neural network models and all the respective single +task transformer models. Our experiments further highlight the benefits of +sharing attention across all the tasks, and demonstrate that our MulT model is +robust and generalizes well to new domains. Our project website is at +https://ivrl.github.io/MulT/.",Multitask Learning, +Do You Really Mean That? Content Driven Audio-Visual Deepfake Dataset and Multimodal Method for Temporal Forgery Localization,"Due to its high societal impact, deepfake detection is getting active +attention in the computer vision community. Most deepfake detection methods +rely on identity, facial attributes, and adversarial perturbation-based +spatio-temporal modifications at the whole video or random locations while +keeping the meaning of the content intact. 
However, a sophisticated deepfake +may contain only a small segment of video/audio manipulation, through which the +meaning of the content can be, for example, completely inverted from a +sentiment perspective. We introduce a content-driven audio-visual deepfake +dataset, termed Localized Audio Visual DeepFake (LAV-DF), explicitly designed +for the task of learning temporal forgery localization. Specifically, the +content-driven audio-visual manipulations are performed strategically to change +the sentiment polarity of the whole video. Our baseline method for benchmarking +the proposed dataset is a 3DCNN model, termed as Boundary Aware Temporal +Forgery Detection (BA-TFD), which is guided via contrastive, boundary matching, +and frame classification loss functions. Our extensive quantitative and +qualitative analysis demonstrates the proposed method's strong performance for +temporal forgery localization and deepfake detection tasks.",Deepfake Detection, +The Winning Solution to the iFLYTEK Challenge 2021 Cultivated Land Extraction from High-Resolution Remote Sensing Image,"Extracting cultivated land accurately from high-resolution remote images is a +basic task for precision agriculture. This report introduces our solution to +the iFLYTEK challenge 2021 cultivated land extraction from high-resolution +remote sensing image. The challenge requires segmenting cultivated land objects +in very high-resolution multispectral remote sensing images. We established a +highly effective and efficient pipeline to solve this problem. We first divided +the original images into small tiles and separately performed instance +segmentation on each tile. We explored several instance segmentation algorithms +that work well on natural images and developed a set of effective methods that +are applicable to remote sensing images. Then we merged the prediction results +of all small tiles into seamless, continuous segmentation results through our +proposed overlap-tile fusion strategy. 
We achieved the first place among 486 +teams in the challenge.",cultivated land extraction, +M$^2$DQN: A Robust Method for Accelerating Deep Q-learning Network,"Deep Q-learning Network (DQN) is a successful way which combines +reinforcement learning with deep neural networks and leads to a widespread +application of reinforcement learning. One challenging problem when applying +DQN or other reinforcement learning algorithms to real world problem is data +collection. Therefore, how to improve data efficiency is one of the most +important problems in the research of reinforcement learning. In this paper, we +propose a framework which uses the Max-Mean loss in Deep Q-Network (M$^2$DQN). +Instead of sampling one batch of experiences in the training step, we sample +several batches from the experience replay and update the parameters such that +the maximum TD-error of these batches is minimized. The proposed method can be +combined with most of existing techniques of DQN algorithm by replacing the +loss function. We verify the effectiveness of this framework with one of the +most widely used techniques, Double DQN (DDQN), in several gym games. The +results show that our method leads to a substantial improvement in both the +learning speed and performance.",Deep Q-learning Network, +Can You Fool AI by Doing a 180? $\unicode{x2013}$ A Case Study on Authorship Analysis of Texts by Arata Osada,"This paper is our attempt at answering a twofold question covering the areas +of ethics and authorship analysis. Firstly, since the methods used for +performing authorship analysis imply that an author can be recognized by the +content he or she creates, we were interested in finding out whether it would +be possible for an author identification system to correctly attribute works to +authors if in the course of years they have undergone a major psychological +transition. 
Secondly, and from the point of view of the evolution of an +author's ethical values, we checked what it would mean if the authorship +attribution system encounters difficulties in detecting single authorship. We +set out to answer those questions through performing a binary authorship +analysis task using a text classifier based on a pre-trained transformer model +and a baseline method relying on conventional similarity metrics. For the test +set, we chose works of Arata Osada, a Japanese educator and specialist in the +history of education, with half of them being books written before the World +War II and another half in the 1950s, in between which he underwent a +transformation in terms of political opinions. As a result, we were able to +confirm that in the case of texts authored by Arata Osada in a time span of +more than 10 years, while the classification accuracy drops by a large margin +and is substantially lower than for texts by other non-fiction writers, +confidence scores of the predictions remain at a similar level as in the case +of a shorter time span, indicating that the classifier was in many instances +tricked into deciding that texts written over a time span of multiple years +were actually written by two different people, which in turn leads us to +believe that such a change can affect authorship analysis, and that historical +events have great impact on a person's ethical outlook as expressed in their +writings.",Authorship Identification, +Inclusive Artificial Intelligence,"Prevailing methods for assessing and comparing generative AIs incentivize +responses that serve a hypothetical representative individual. Evaluating +models in these terms presumes homogeneous preferences across the population +and engenders selection of agglomerative AIs, which fail to represent the +diverse range of interests across individuals. 
We propose an alternative +evaluation method that instead prioritizes inclusive AIs, which provably retain +the requisite knowledge not only for subsequent response customization to +particular segments of the population but also for utility-maximizing +decisions.",Generative AI Evaluation, +"Towards technological adaptation of advanced farming through AI, IoT, and Robotics: A Comprehensive overview","The population explosion of the 21st century has adversely affected the +natural resources with restricted availability of cultivable land, increased +average temperatures due to global warming, and carbon footprint resulting in a +drastic increase in floods as well as droughts thus making food security +significant anxiety for most countries. The traditional methods were no longer +sufficient which paved the way for technological ascents such as a substantial +rise in Artificial Intelligence (AI), Internet of Things (IoT), as well as +Robotics that provides high productivity, functional efficiency, flexibility, +cost-effectiveness in the domain of agriculture. AI, IoT, and Robotics-based +devices and methods have produced new paradigms and opportunities in +agriculture. AI's existing approaches are soil management, crop diseases +identification, weed identification, and management in collaboration with IoT +devices. IoT has utilized automatic agricultural operations and real-time +monitoring with few personnel employed in real-time. The major existing +applications of agricultural robotics are for the function of soil preparation, +planting, monitoring, harvesting, and storage. 
In this paper, researchers have +explored a comprehensive overview of recent implementation, scopes, +opportunities, challenges, limitations, and future research instructions of AI, +IoT, and Robotics based methodology in the agriculture sector.",agriculture Robotics, +Zero Shot Crosslingual Eye-Tracking Data Prediction using Multilingual Transformer Models,"Eye tracking data during reading is a useful source of information to +understand the cognitive processes that take place during language +comprehension processes. Different languages account for different brain +triggers, however there seems to be some uniform indicators. In this paper, we +describe our submission to the CMCL 2022 shared task on predicting human +reading patterns for multi-lingual dataset. Our model uses text representations +from transformers and some hand engineered features with a regression layer on +top to predict statistical measures of mean and standard deviation for 2 main +eye-tracking features. We train an end to end model to extract meaningful +information from different languages and test our model on two separate +datasets. We compare different transformer models and show ablation studies +affecting model performance. Our final submission ranked 4th place for +SubTask-1 and 1st place for SubTask-2 for the shared task.",Multilingual Reading Pattern Prediction, +Privacy-Preserving Image Classification Using Isotropic Network,"In this paper, we propose a privacy-preserving image classification method +that uses encrypted images and an isotropic network such as the vision +transformer. The proposed method allows us not only to apply images without +visual information to deep neural networks (DNNs) for both training and testing +but also to maintain a high classification accuracy. In addition, compressible +encrypted images, called encryption-then-compression (EtC) images, can be used +for both training and testing without any adaptation network.
Previously, to +classify EtC images, an adaptation network was required before a classification +network, so methods with an adaptation network have been only tested on small +images. To the best of our knowledge, previous privacy-preserving image +classification methods have never considered image compressibility and patch +embedding-based isotropic networks. In an experiment, the proposed +privacy-preserving image classification was demonstrated to outperform +state-of-the-art methods even when EtC images were used in terms of +classification accuracy and robustness against various attacks under the use of +two isotropic networks: vision transformer and ConvMixer.",privacy-preserving image classification, +5q032e@SMM4H'22: Transformer-based classification of premise in tweets related to COVID-19,"Automation of social network data assessment is one of the classic challenges +of natural language processing. During the COVID-19 pandemic, mining people's +stances from public messages have become crucial regarding understanding +attitudes towards health orders. In this paper, the authors propose the +predictive model based on transformer architecture to classify the presence of +premise in Twitter texts. This work is completed as part of the Social Media +Mining for Health (SMM4H) Workshop 2022. We explored modern transformer-based +classifiers in order to construct the pipeline efficiently capturing tweets +semantics. Our experiments on a Twitter dataset showed that RoBERTa is superior +to the other transformer models in the case of the premise prediction task. The +model achieved competitive performance with respect to ROC AUC value 0.807, and +0.7648 for the F1 score.",Tweet Stance Classification, +NeAT: Neural Adaptive Tomography,"In this paper, we present Neural Adaptive Tomography (NeAT), the first +adaptive, hierarchical neural rendering pipeline for multi-view inverse +rendering. 
Through a combination of neural features with an adaptive explicit +representation, we achieve reconstruction times far superior to existing neural +inverse rendering methods. The adaptive explicit representation improves +efficiency by facilitating empty space culling and concentrating samples in +complex regions, while the neural features act as a neural regularizer for the +3D reconstruction. The NeAT framework is designed specifically for the +tomographic setting, which consists only of semi-transparent volumetric scenes +instead of opaque objects. In this setting, NeAT outperforms the quality of +existing optimization-based tomography solvers while being substantially +faster.",Multi-View Inverse Rendering, +A Generalized & Robust Framework For Timestamp Supervision in Temporal Action Segmentation,"In temporal action segmentation, Timestamp supervision requires only a +handful of labelled frames per video sequence. For unlabelled frames, previous +works rely on assigning hard labels, and performance rapidly collapses under +subtle violations of the annotation assumptions. We propose a novel +Expectation-Maximization (EM) based approach that leverages the label +uncertainty of unlabelled frames and is robust enough to accommodate possible +annotation errors. With accurate timestamp annotations, our proposed method +produces SOTA results and even exceeds the fully-supervised setup in several +metrics and datasets. When applied to timestamp annotations with missing action +segments, our method presents stable performance. To further test our +formulation's robustness, we introduce the new challenging annotation setup of +Skip-tag supervision. 
This setup relaxes constraints and requires annotations +of any fixed number of random frames in a video, making it more flexible than +Timestamp supervision while remaining competitive.",temporal action segmentation, +Depthformer : Multiscale Vision Transformer For Monocular Depth Estimation With Local Global Information Fusion,"Attention-based models such as transformers have shown outstanding +performance on dense prediction tasks, such as semantic segmentation, owing to +their capability of capturing long-range dependency in an image. However, the +benefit of transformers for monocular depth prediction has seldom been explored +so far. This paper benchmarks various transformer-based models for the depth +estimation task on an indoor NYUV2 dataset and an outdoor KITTI dataset. We +propose a novel attention-based architecture, Depthformer for monocular depth +estimation that uses multi-head self-attention to produce the multiscale +feature maps, which are effectively combined by our proposed decoder network. +We also propose a Transbins module that divides the depth range into bins whose +center value is estimated adaptively per image. The final depth estimated is a +linear combination of bin centers for each pixel. Transbins module takes +advantage of the global receptive field using the transformer module in the +encoding stage. Experimental results on NYUV2 and KITTI depth estimation +benchmark demonstrate that our proposed method improves the state-of-the-art by +3.3%, and 3.3% respectively in terms of Root Mean Squared Error (RMSE). 
Code is +available at https://github.com/ashutosh1807/Depthformer.git.",monocular depth estimation, +COCO-DR: Combating Distribution Shifts in Zero-Shot Dense Retrieval with Contrastive and Distributionally Robust Learning,"We present a new zero-shot dense retrieval (ZeroDR) method, COCO-DR, to +improve the generalization ability of dense retrieval by combating the +distribution shifts between source training tasks and target scenarios. To +mitigate the impact of document differences, COCO-DR continues pretraining the +language model on the target corpora to adapt the model to target distributions +via COntinuous COntrastive learning. To prepare for unseen target queries, +COCO-DR leverages implicit Distributionally Robust Optimization (iDRO) to +reweight samples from different source query clusters for improving model +robustness over rare queries during fine-tuning. COCO-DR achieves superior +average performance on BEIR, the zero-shot retrieval benchmark. At BERT Base +scale, COCO-DR Base outperforms other ZeroDR models with 60x larger size. At +BERT Large scale, COCO-DR Large outperforms the giant GPT-3 embedding model +which has 500x more parameters. Our analysis shows the correlation between +COCO-DR's effectiveness in combating distribution shifts and improving +zero-shot accuracy. Our code and model can be found at +\url{https://github.com/OpenMatch/COCO-DR}.",zero-shot dense retrieval, +Hierarchical Decomposition and Analysis for Generalized Planning,"This paper presents new methods for analyzing and evaluating generalized +plans that can solve broad classes of related planning problems. Although +synthesis and learning of generalized plans has been a longstanding goal in AI, +it remains challenging due to fundamental gaps in methods for analyzing the +scope and utility of a given generalized plan.
This paper addresses these gaps +by developing a new conceptual framework along with proof techniques and +algorithmic processes for assessing termination and goal-reachability related +properties of generalized plans. We build upon classic results from graph +theory to decompose generalized plans into smaller components that are then +used to derive hierarchical termination arguments. These methods can be used to +determine the utility of a given generalized plan, as well as to guide the +synthesis and learning processes for generalized plans. We present theoretical +as well as empirical results illustrating the scope of this new approach. Our +analysis shows that this approach significantly extends the class of +generalized plans that can be assessed automatically, thereby reducing barriers +in the synthesis and learning of reliable generalized plans.",generalized planning analysis, +Deep learning for spatio-temporal forecasting -- application to solar energy,"This thesis tackles the subject of spatio-temporal forecasting with deep +learning. The motivating application at Electricity de France (EDF) is +short-term solar energy forecasting with fisheye images. We explore two main +research directions for improving deep forecasting methods by injecting +external physical knowledge. The first direction concerns the role of the +training loss function. We show that differentiable shape and temporal criteria +can be leveraged to improve the performances of existing models. We address +both the deterministic context with the proposed DILATE loss function and the +probabilistic context with the STRIPE model. Our second direction is to augment +incomplete physical models with deep data-driven networks for accurate +forecasting. For video prediction, we introduce the PhyDNet model that +disentangles physical dynamics from residual information necessary for +prediction, such as texture or details. 
We further propose a learning framework +(APHYNITY) that ensures a principled and unique linear decomposition between +physical and data-driven components under mild assumptions, leading to better +forecasting performances and parameter identification.",spatio-temporal forecasting, +ReAct: Synergizing Reasoning and Acting in Language Models,"While large language models (LLMs) have demonstrated impressive capabilities +across tasks in language understanding and interactive decision making, their +abilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. +action plan generation) have primarily been studied as separate topics. In this +paper, we explore the use of LLMs to generate both reasoning traces and +task-specific actions in an interleaved manner, allowing for greater synergy +between the two: reasoning traces help the model induce, track, and update +action plans as well as handle exceptions, while actions allow it to interface +with external sources, such as knowledge bases or environments, to gather +additional information. We apply our approach, named ReAct, to a diverse set of +language and decision making tasks and demonstrate its effectiveness over +state-of-the-art baselines, as well as improved human interpretability and +trustworthiness over methods without reasoning or acting components. +Concretely, on question answering (HotpotQA) and fact verification (Fever), +ReAct overcomes issues of hallucination and error propagation prevalent in +chain-of-thought reasoning by interacting with a simple Wikipedia API, and +generates human-like task-solving trajectories that are more interpretable than +baselines without reasoning traces. On two interactive decision making +benchmarks (ALFWorld and WebShop), ReAct outperforms imitation and +reinforcement learning methods by an absolute success rate of 34% and 10% +respectively, while being prompted with only one or two in-context examples. 
+Project site with code: https://react-lm.github.io",language models reasoning, +Cyclical Focal Loss,"The cross-entropy softmax loss is the primary loss function used to train +deep neural networks. On the other hand, the focal loss function has been +demonstrated to provide improved performance when there is an imbalance in the +number of training samples in each class, such as in long-tailed datasets. In +this paper, we introduce a novel cyclical focal loss and demonstrate that it is +a more universal loss function than cross-entropy softmax loss or focal loss. +We describe the intuition behind the cyclical focal loss and our experiments +provide evidence that cyclical focal loss provides superior performance for +balanced, imbalanced, or long-tailed datasets. We provide numerous experimental +results for CIFAR-10/CIFAR-100, ImageNet, balanced and imbalanced 4,000 +training sample versions of CIFAR-10/CIFAR-100, and ImageNet-LT and Places-LT +from the Open Long-Tailed Recognition (OLTR) challenge. Implementing the +cyclical focal loss function requires only a few lines of code and does not +increase training time. In the spirit of reproducibility, our code is available +at \url{https://github.com/lnsmith54/CFL}.",loss function , +DePA: Improving Non-autoregressive Machine Translation with Dependency-Aware Decoder,"Non-autoregressive machine translation (NAT) models have lower translation +quality than autoregressive translation (AT) models because NAT decoders do not +depend on previous target tokens in the decoder input. We propose a novel and +general Dependency-Aware Decoder (DePA) to enhance target dependency modeling +in the decoder of fully NAT models from two perspectives: decoder +self-attention and decoder input. First, we propose an autoregressive +forward-backward pre-training phase before NAT training, which enables the NAT +decoder to gradually learn bidirectional target dependencies for the final NAT +training. 
Second, we transform the decoder input from the source language +representation space to the target language representation space through a +novel attentive transformation process, which enables the decoder to better +capture target dependencies. DePA can be applied to any fully NAT models. +Extensive experiments show that DePA consistently improves highly competitive +and state-of-the-art fully NAT models on widely used WMT and IWSLT benchmarks +by up to 1.88 BLEU gain, while maintaining the inference latency comparable to +other fully NAT models.",Non-autoregressive machine translation, +Signed Binary Weight Networks,"Efficient inference of Deep Neural Networks (DNNs) is essential to making AI +ubiquitous. Two important algorithmic techniques have shown promise for +enabling efficient inference - sparsity and binarization. These techniques +translate into weight sparsity and weight repetition at the hardware-software +level enabling the deployment of DNNs with critically low power and latency +requirements. We propose a new method called signed-binary networks to improve +efficiency further (by exploiting both weight sparsity and weight repetition +together) while maintaining similar accuracy. Our method achieves comparable +accuracy on ImageNet and CIFAR10 datasets with binary and can lead to 69% +sparsity. We observe real speedup when deploying these models on +general-purpose devices and show that this high percentage of unstructured +sparsity can lead to a further reduction in energy consumption on ASICs.",Efficient Neural Network, +Vers la compréhension automatique de la parole bout-en-bout à moindre effort,"Recent advances in spoken language understanding benefited from +Self-Supervised models trained on large speech corpora. For French, the +LeBenchmark project has made such models available and has led to impressive +progress on several tasks including spoken language understanding.
These +advances have a non-negligible cost in terms of computation time and energy +consumption. In this paper, we compare several learning strategies aiming at +reducing such cost while keeping competitive performances. The experiments are +performed on the MEDIA corpus, and show that it is possible to reduce the +learning cost while maintaining state-of-the-art performances.",spoken language understanding, +Unified Bayesian Frameworks for Multi-criteria Decision-making Problems,"This paper introduces Bayesian frameworks for tackling various aspects of +multi-criteria decision-making (MCDM) problems, leveraging a probabilistic +interpretation of MCDM methods and challenges. By harnessing the flexibility of +Bayesian models, the proposed frameworks offer statistically elegant solutions +to key challenges in MCDM, such as group decision-making problems and criteria +correlation. Additionally, these models can accommodate diverse forms of +uncertainty in decision makers' (DMs) preferences, including normal and +triangular distributions, as well as interval preferences. To address +large-scale group MCDM scenarios, a probabilistic mixture model is developed, +enabling the identification of homogeneous subgroups of DMs. Furthermore, a +probabilistic ranking scheme is devised to assess the relative importance of +criteria and alternatives based on DM(s) preferences. Through experimentation +on various numerical examples, the proposed frameworks are validated, +demonstrating their effectiveness and highlighting their distinguishing +features in comparison to alternative methods.",Multi-Criteria Decision Making, +Analysis & Computational Complexity Reduction of Monocular and Stereo Depth Estimation Techniques,"Accurate depth estimation with lowest compute and energy cost is a crucial +requirement for unmanned and battery operated autonomous systems. 
Robotic +applications require real time depth estimation for navigation and decision +making under rapidly changing 3D surroundings. A high accuracy algorithm may +provide the best depth estimation but may consume tremendous compute and energy +resources. A general trade-off is to choose less accurate methods for initial +depth estimate and a more accurate yet compute intensive method when needed. +Previous work has shown this trade-off can be improved by developing a +state-of-the-art method (AnyNet) to improve stereo depth estimation. + We studied both the monocular and stereo vision depth estimation methods and +investigated methods to reduce computational complexity of these methods. This +was our baseline. Consequently, our experiments show reduction of monocular +depth estimation model size by ~75% reduces accuracy by less than 2% (SSIM +metric). Our experiments with the novel stereo vision method (AnyNet) show that +accuracy of depth estimation does not degrade more than 3% (three pixel error +metric) in spite of reduction in model size by ~20%. We have shown that smaller +models can indeed perform competitively.",Depth Estimation Optimization, +Thinking Fast and Slow in Large Language Models,"Large language models (LLMs) are currently at the forefront of intertwining +AI systems with human communication and everyday life. Therefore, it is of +great importance to evaluate their emerging abilities. In this study, we show +that LLMs like GPT-3 exhibit behavior that strikingly resembles human-like +intuition - and the cognitive errors that come with it. However, LLMs with +higher cognitive capabilities, in particular ChatGPT and GPT-4, learned to +avoid succumbing to these errors and perform in a hyperrational manner. For our +experiments, we probe LLMs with the Cognitive Reflection Test (CRT) as well as +semantic illusions that were originally designed to investigate intuitive +decision-making in humans. 
Our study demonstrates that investigating LLMs with +methods from psychology has the potential to reveal otherwise unknown emergent +traits.",Cognitive Ability Evaluation of Language Models, +Mixed Nondeterministic-Probabilistic Automata: Blending graphical probabilistic models with nondeterminism,"Graphical models in probability and statistics are a core concept in the area +of probabilistic reasoning and probabilistic programming-graphical models +include Bayesian networks and factor graphs. In this paper we develop a new +model of mixed (nondeterministic/probabilistic) automata that subsumes both +nondeterministic automata and graphical probabilistic models. Mixed Automata +are equipped with parallel composition, simulation relation, and support +message passing algorithms inherited from graphical probabilistic models. +Segala's Probabilistic Automata can be mapped to Mixed Automata.",mixed automata, +Many Ways to Be Lonely: Fine-Grained Characterization of Loneliness and Its Potential Changes in COVID-19,"Loneliness has been associated with negative outcomes for physical and mental +health. Understanding how people express and cope with various forms of +loneliness is critical for early screening and targeted interventions to reduce +loneliness, particularly among vulnerable groups such as young adults. To +examine how different forms of loneliness and coping strategies manifest in +loneliness self-disclosure, we built a dataset, FIG-Loneliness (FIne-Grained +Loneliness) by using Reddit posts in two young adult-focused forums and two +loneliness related forums consisting of a diverse age group. We provided +annotations by trained human annotators for binary and fine-grained loneliness +classifications of the posts. Trained on FIG-Loneliness, two BERT-based models +were used to understand loneliness forms and authors' coping strategies in +these forums.
Our binary loneliness classification achieved an accuracy above +97%, and fine-grained loneliness category classification reached an average +accuracy of 77% across all labeled categories. With FIG-Loneliness and model +predictions, we found that loneliness expressions in the young adults related +forums were distinct from other forums. Those in young adult-focused forums +were more likely to express concerns pertaining to peer relationship, and were +potentially more sensitive to geographical isolation impacted by the COVID-19 +pandemic lockdown. Also, we showed that different forms of loneliness have +differential use in coping strategies.",loneliness classification, +MaXM: Towards Multilingual Visual Question Answering,"Visual Question Answering (VQA) has been primarily studied through the lens +of the English language. Yet, tackling VQA in other languages in the same +manner would require a considerable amount of resources. In this paper, we +propose scalable solutions to multilingual visual question answering (mVQA), on +both data and modeling fronts. We first propose a translation-based framework +to mVQA data generation that requires much less human annotation efforts than +the conventional approach of directly collection questions and answers. Then, +we apply our framework to the multilingual captions in the Crossmodal-3600 +dataset and develop an efficient annotation protocol to create MaXM, a +test-only VQA benchmark in 7 diverse languages. Finally, we develop a simple, +lightweight, and effective approach as well as benchmark state-of-the-art +English and multilingual VQA models. We hope that our benchmark encourages +further research on mVQA.",multilingual visual question answering, +Tightly Coupled Learning Strategy for Weakly Supervised Hierarchical Place Recognition,"Visual place recognition (VPR) is a key issue for robotics and autonomous +systems. 
For the trade-off between time and performance, most of methods use +the coarse-to-fine hierarchical architecture, which consists of retrieving +top-N candidates using global features, and re-ranking top-N with local +features. However, since the two types of features are usually processed +independently, re-ranking may harm global retrieval, termed re-ranking +confusion. Moreover, re-ranking is limited by global retrieval. In this paper, +we propose a tightly coupled learning (TCL) strategy to train triplet models. +Different from original triplet learning (OTL) strategy, it combines global and +local descriptors for joint optimization. In addition, a bidirectional search +dynamic time warping (BS-DTW) algorithm is also proposed to mine locally +spatial information tailored to VPR in re-ranking. The experimental results on +public benchmarks show that the models using TCL outperform the models using +OTL, and TCL can be used as a general strategy to improve performance for +weakly supervised ranking tasks. Further, our lightweight unified model is +better than several state-of-the-art methods and has over an order of magnitude +of computational efficiency to meet the real-time requirements of robots.",Visual place recognition , +MemSAC: Memory Augmented Sample Consistency for Large Scale Unsupervised Domain Adaptation,"Practical real world datasets with plentiful categories introduce new +challenges for unsupervised domain adaptation like small inter-class +discriminability, that existing approaches relying on domain invariance alone +cannot handle sufficiently well. In this work we propose MemSAC, which exploits +sample level similarity across source and target domains to achieve +discriminative transfer, along with architectures that scale to a large number +of categories. 
For this purpose, we first introduce a memory augmented approach +to efficiently extract pairwise similarity relations between labeled source and +unlabeled target domain instances, suited to handle an arbitrary number of +classes. Next, we propose and theoretically justify a novel variant of the +contrastive loss to promote local consistency among within-class cross domain +samples while enforcing separation between classes, thus preserving +discriminative transfer from source to target. We validate the advantages of +MemSAC with significant improvements over previous state-of-the-art on multiple +challenging transfer tasks designed for large-scale adaptation, such as +DomainNet with 345 classes and fine-grained adaptation on Caltech-UCSD birds +dataset with 200 classes. We also provide in-depth analysis and insights into +the effectiveness of MemSAC.",unsupervised domain adaptation, +Deep Learning of Causal Structures in High Dimensions,"Recent years have seen rapid progress at the intersection between causality +and machine learning. Motivated by scientific applications involving +high-dimensional data, in particular in biomedicine, we propose a deep neural +architecture for learning causal relationships between variables from a +combination of empirical data and prior causal knowledge. We combine +convolutional and graph neural networks within a causal risk framework to +provide a flexible and scalable approach. Empirical results include linear and +nonlinear simulations (where the underlying causal structures are known and can +be directly compared against), as well as a real biological example where the +models are applied to high-dimensional molecular data and their output compared +against entirely unseen validation experiments. 
These results demonstrate the +feasibility of using deep learning approaches to learn causal networks in +large-scale problems spanning thousands of variables.",Causal Relationship Learning, +DivEMT: Neural Machine Translation Post-Editing Effort Across Typologically Diverse Languages,"We introduce DivEMT, the first publicly available post-editing study of +Neural Machine Translation (NMT) over a typologically diverse set of target +languages. Using a strictly controlled setup, 18 professional translators were +instructed to translate or post-edit the same set of English documents into +Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, +their edits, keystrokes, editing times and pauses were recorded, enabling an +in-depth, cross-lingual evaluation of NMT quality and post-editing +effectiveness. Using this new dataset, we assess the impact of two +state-of-the-art NMT systems, Google Translate and the multilingual mBART-50 +model, on translation productivity. We find that post-editing is consistently +faster than translation from scratch. However, the magnitude of productivity +gains varies widely across systems and languages, highlighting major +disparities in post-editing effectiveness for languages at different degrees of +typological relatedness to English, even when controlling for system +architecture and training data size. We publicly release the complete dataset +including all collected behavioral data, to foster new research on the +translation capabilities of NMT systems for typologically diverse languages."," +Neural Machine Translation", +In the Eye of Transformer: Global-Local Correlation for Egocentric Gaze Estimation,"In this paper, we present the first transformer-based model to address the +challenging problem of egocentric gaze estimation. We observe that the +connection between the global scene context and local visual information is +vital for localizing the gaze fixation from egocentric video frames. 
To this +end, we design the transformer encoder to embed the global context as one +additional visual token and further propose a novel Global-Local Correlation +(GLC) module to explicitly model the correlation of the global token and each +local token. We validate our model on two egocentric video datasets - EGTEA +Gaze+ and Ego4D. Our detailed ablation studies demonstrate the benefits of our +method. In addition, our approach exceeds previous state-of-the-arts by a large +margin. We also provide additional visualizations to support our claim that +global-local correlation serves a key representation for predicting gaze +fixation from egocentric videos. More details can be found in our website +(https://bolinlai.github.io/GLC-EgoGazeEst).",egocentric gaze estimation, +Improving Visual-textual Sentiment Analysis by Fusing Expert Features,"Visual-textual sentiment analysis aims to predict sentiment with the input of +a pair of image and text. The main challenge of visual-textual sentiment +analysis is how to learn effective visual features for sentiment prediction +since input images are often very diverse. To address this challenge, we +propose a new method that improves visual-textual sentiment analysis by +introducing powerful expert visual features. The proposed method consists of +four parts: (1) a visual-textual branch to learn features directly from data +for sentiment analysis, (2) a visual expert branch with a set of pre-trained +""expert"" encoders to extract effective visual features, (3) a CLIP branch to +implicitly model visual-textual correspondence, and (4) a multimodal feature +fusion network based on either BERT or MLP to fuse multimodal features and make +sentiment prediction. 
Extensive experiments on three datasets show that our +method produces better visual-textual sentiment analysis performance than +existing methods.",visual-textual sentiment analysis, +PLOT: Prompt Learning with Optimal Transport for Vision-Language Models,"With the increasing attention to large vision-language models such as CLIP, +there has been a significant amount of effort dedicated to building efficient +prompts. Unlike conventional methods of only learning one single prompt, we +propose to learn multiple comprehensive prompts to describe diverse +characteristics of categories such as intrinsic attributes or extrinsic +contexts. However, directly matching each prompt to the same visual feature is +problematic, as it pushes the prompts to converge to one point. To solve this +problem, we propose to apply optimal transport to match the vision and text +modalities. Specifically, we first model images and the categories with visual +and textual feature sets. Then, we apply a two-stage optimization strategy to +learn the prompts. In the inner loop, we optimize the optimal transport +distance to align visual features and prompts by the Sinkhorn algorithm, while +in the outer loop, we learn the prompts by this distance from the supervised +data. Extensive experiments are conducted on the few-shot recognition task and +the improvement demonstrates the superiority of our method. The code is +available at https://github.com/CHENGY12/PLOT.",prompt learning for vision-language models, +Development of an Extractive Clinical Question Answering Dataset with Multi-Answer and Multi-Focus Questions,"Background: Extractive question-answering (EQA) is a useful natural language +processing (NLP) application for answering patient-specific questions by +locating answers in their clinical notes. 
Realistic clinical EQA can have +multiple answers to a single question and multiple focus points in one +question, which are lacking in the existing datasets for development of +artificial intelligence solutions. Objective: Create a dataset for developing +and evaluating clinical EQA systems that can handle natural multi-answer and +multi-focus questions. Methods: We leveraged the annotated relations from the +2018 National NLP Clinical Challenges (n2c2) corpus to generate an EQA dataset. +Specifically, the 1-to-N, M-to-1, and M-to-N drug-reason relations were +included to form the multi-answer and multi-focus QA entries, which represent +more complex and natural challenges in addition to the basic +one-drug-one-reason cases. A baseline solution was developed and tested on the +dataset. Results: The derived RxWhyQA dataset contains 96,939 QA entries. Among +the answerable questions, 25% require multiple answers, and 2% ask about +multiple drugs within one question. There are frequent cues observed around the +answers in the text, and 90% of the drug and reason terms occur within the same +or an adjacent sentence. The baseline EQA solution achieved a best f1-measure +of 0.72 on the entire dataset, and on specific subsets, it was: 0.93 on the +unanswerable questions, 0.48 on single-drug questions versus 0.60 on multi-drug +questions, 0.54 on the single-answer questions versus 0.43 on multi-answer +questions. Discussion: The RxWhyQA dataset can be used to train and evaluate +systems that need to handle multi-answer and multi-focus questions. +Specifically, multi-answer EQA appears to be challenging and therefore warrants +more investment in research.",Clinical Extractive Question-Answering, +StyleRes: Transforming the Residuals for Real Image Editing with StyleGAN,"We present a novel image inversion framework and a training pipeline to +achieve high-fidelity image inversion with high-quality attribute editing. 
+Inverting real images into StyleGAN's latent space is an extensively studied +problem, yet the trade-off between the image reconstruction fidelity and image +editing quality remains an open challenge. The low-rate latent spaces are +limited in their expressiveness power for high-fidelity reconstruction. On the +other hand, high-rate latent spaces result in degradation in editing quality. +In this work, to achieve high-fidelity inversion, we learn residual features in +higher latent codes that lower latent codes were not able to encode. This +enables preserving image details in reconstruction. To achieve high-quality +editing, we learn how to transform the residual features for adapting to +manipulations in latent codes. We train the framework to extract residual +features and transform them via a novel architecture pipeline and cycle +consistency losses. We run extensive experiments and compare our method with +state-of-the-art inversion methods. Qualitative metrics and visual comparisons +show significant improvements. Code: https://github.com/hamzapehlivan/StyleRes",image inversion, +Tutorial on amortized optimization,"Optimization is a ubiquitous modeling tool and is often deployed in settings +which repeatedly solve similar instances of the same problem. Amortized +optimization methods use learning to predict the solutions to problems in these +settings, exploiting the shared structure between similar problem instances. +These methods have been crucial in variational inference and reinforcement +learning and are capable of solving optimization problems many orders of +magnitudes times faster than traditional optimization methods that do not use +amortization. 
This tutorial presents an introduction to the amortized +optimization foundations behind these advancements and overviews their +applications in variational inference, sparse coding, gradient-based +meta-learning, control, reinforcement learning, convex optimization, optimal +transport, and deep equilibrium networks. The source code for this tutorial is +available at +https://github.com/facebookresearch/amortized-optimization-tutorial.",amortized optimization, +Federated Named Entity Recognition,"We present an analysis of the performance of Federated Learning in a +paradigmatic natural-language processing task: Named-Entity Recognition (NER). +For our evaluation, we use the language-independent CoNLL-2003 dataset as our +benchmark dataset and a Bi-LSTM-CRF model as our benchmark NER model. We show +that federated training reaches almost the same performance as the centralized +model, though with some performance degradation as the learning environments +become more heterogeneous. We also show the convergence rate of federated +models for NER. Finally, we discuss existing challenges of Federated Learning +for NLP applications that can foster future research directions.",Named-Entity Recognition, +Inpainting at Modern Camera Resolution by Guided PatchMatch with Auto-Curation,"Recently, deep models have established SOTA performance for low-resolution +image inpainting, but they lack fidelity at resolutions associated with modern +cameras such as 4K or more, and for large holes. We contribute an inpainting +benchmark dataset of photos at 4K and above representative of modern sensors. +We demonstrate a novel framework that combines deep learning and traditional +methods. We use an existing deep inpainting model LaMa to fill the hole +plausibly, establish three guide images consisting of structure, segmentation, +depth, and apply a multiply-guided PatchMatch to produce eight candidate +upsampled inpainted images. 
Next, we feed all candidate inpaintings through a +novel curation module that chooses a good inpainting by column summation on an +8x8 antisymmetric pairwise preference matrix. Our framework's results are +overwhelmingly preferred by users over 8 strong baselines, with improvements of +quantitative metrics up to 7.4 over the best baseline LaMa, and our technique +when paired with 4 different SOTA inpainting backbones improves each such that +ours is overwhelmingly preferred by users over a strong super-res baseline.",image inpainting, +Molecular Joint Representation Learning via Multi-modal Information,"In recent years, artificial intelligence has played an important role on +accelerating the whole process of drug discovery. Various of molecular +representation schemes of different modals (e.g. textual sequence or graph) are +developed. By digitally encoding them, different chemical information can be +learned through corresponding network structures. Molecular graphs and +Simplified Molecular Input Line Entry System (SMILES) are popular means for +molecular representation learning in current. Previous works have done attempts +by combining both of them to solve the problem of specific information loss in +single-modal representation on various tasks. To further fusing such +multi-modal imformation, the correspondence between learned chemical feature +from different representation should be considered. To realize this, we propose +a novel framework of molecular joint representation learning via Multi-Modal +information of SMILES and molecular Graphs, called MMSG. We improve the +self-attention mechanism by introducing bond level graph representation as +attention bias in Transformer to reinforce feature correspondence between +multi-modal information. We further propose a Bidirectional Message +Communication Graph Neural Network (BMC GNN) to strengthen the information flow +aggregated from graphs for further combination. 
Numerous experiments on public +property prediction datasets have demonstrated the effectiveness of our model.",Molecular Representation Learning, +Dynamic Test-Time Augmentation via Differentiable Functions,"Distribution shifts, which often occur in the real world, degrade the +accuracy of deep learning systems, and thus improving robustness is essential +for practical applications. To improve robustness, we study an image +enhancement method that generates recognition-friendly images without +retraining the recognition model. We propose a novel image enhancement method, +DynTTA, which is based on differentiable data augmentation techniques and +generates a blended image from many augmented images to improve the recognition +accuracy under distribution shifts. In addition to standard data augmentations, +DynTTA also incorporates deep neural network-based image transformation, which +further improves the robustness. Because DynTTA is composed of differentiable +functions, it is directly trained with the classification loss of the +recognition model. We experiment with widely used image recognition datasets +using various classification models, including Vision Transformer and +MLP-Mixer. DynTTA improves the robustness with almost no reduction in +classification accuracy for clean images, which is a better result than the +existing methods. Furthermore, we show that estimating the training time +augmentation for distribution-shifted datasets using DynTTA and retraining the +recognition model with the estimated augmentation significantly improves +robustness.",test-time augmentation, +Coordinated Multi-Agent Reinforcement Learning for Unmanned Aerial Vehicle Swarms in Autonomous Mobile Access Applications,"This paper proposes a novel centralized training and distributed execution +(CTDE)-based multi-agent deep reinforcement learning (MADRL) method for +multiple unmanned aerial vehicles (UAVs) control in autonomous mobile access +applications. 
For the purpose, a single neural network is utilized in +centralized training for cooperation among multiple agents while maximizing the +total quality of service (QoS) in mobile access applications.",Multi-Agent Reinforcement Learning, +Mirror-Yolo: An attention-based instance segmentation and detection model for mirrors,"Mirrors can degrade the performance of computer vision models, however to +accurately detect mirrors in images remains challenging. YOLOv4 achieves +phenomenal results both in object detection accuracy and speed, nevertheless +the model often fails in detecting mirrors. In this paper, a novel mirror +detection method `Mirror-YOLO' is proposed, which mainly targets on mirror +detection. Based on YOLOv4, the proposed model embeds an attention mechanism +for better feature acquisition, and a hypercolumn-stairstep approach for +feature map fusion. Mirror-YOLO can also produce accurate bounding polygons for +instance segmentation. The effectiveness of our proposed model is demonstrated +by our experiments, compared to the existing mirror detection methods, the +proposed Mirror-YOLO achieves better performance in detection accuracy on the +mirror image dataset.",mirror detection, +N-ACT: An Interpretable Deep Learning Model for Automatic Cell Type and Salient Gene Identification,"Single-cell RNA sequencing (scRNAseq) is rapidly advancing our understanding +of cellular composition within complex tissues and organisms. A major +limitation in most scRNAseq analysis pipelines is the reliance on manual +annotations to determine cell identities, which are time consuming, subjective, +and require expertise. Given the surge in cell sequencing, supervised +methods-especially deep learning models-have been developed for automatic cell +type identification (ACTI), which achieve high accuracy and scalability. +However, all existing deep learning frameworks for ACTI lack interpretability +and are used as ""black-box"" models. 
We present N-ACT (Neural-Attention for Cell +Type identification): the first-of-its-kind interpretable deep neural network +for ACTI utilizing neural-attention to detect salient genes for use in +cell-type identification. We compare N-ACT to conventional annotation methods +on two previously manually annotated data sets, demonstrating that N-ACT +accurately identifies marker genes and cell types in an unsupervised manner, +while performing comparably on multiple data sets to current state-of-the-art +model in traditional supervised ACTI.",Cell Type Identification, +Towards Cross-Disaster Building Damage Assessment with Graph Convolutional Networks,"In the aftermath of disasters, building damage maps are obtained using change +detection to plan rescue operations. Current convolutional neural network +approaches do not consider the similarities between neighboring buildings for +predicting the damage. We present a novel graph-based building damage detection +solution to capture these relationships. Our proposed model architecture learns +from both local and neighborhood features to predict building damage. +Specifically, we adopt the sample and aggregate graph convolution strategy to +learn aggregation functions that generalize to unseen graphs which is essential +for alleviating the time needed to obtain predictions for new disasters. Our +experiments on the xBD dataset and comparisons with a classical convolutional +neural network reveal that while our approach is handicapped by class +imbalance, it presents a promising and distinct advantage when it comes to +cross-disaster generalization.",building damage assessment, +Towards Targeted Change Detection with Heterogeneous Remote Sensing Images for Forest Mortality Mapping,"Several generic methods have recently been developed for change detection in +heterogeneous remote sensing data, such as images from synthetic aperture radar +(SAR) and multispectral radiometers. 
However, these are not well suited to +detect weak signatures of certain disturbances of ecological systems. To +resolve this problem we propose a new approach based on image-to-image +translation and one-class classification (OCC). We aim to map forest mortality +caused by an outbreak of geometrid moths in a sparsely forested forest-tundra +ecotone using multisource satellite images. The images preceding and following +the event are collected by Landsat-5 and RADARSAT-2, respectively. Using a +recent deep learning method for change-aware image translation, we compute +difference images in both satellites' respective domains. These differences are +stacked with the original pre- and post-event images and passed to an OCC +trained on a small sample from the targeted change class. The classifier +produces a credible map of the complex pattern of forest mortality.",Ecological Disturbance Detection, +Video Extrapolation in Space and Time,"Novel view synthesis (NVS) and video prediction (VP) are typically considered +disjoint tasks in computer vision. However, they can both be seen as ways to +observe the spatial-temporal world: NVS aims to synthesize a scene from a new +point of view, while VP aims to see a scene from a new point of time. These two +tasks provide complementary signals to obtain a scene representation, as +viewpoint changes from spatial observations inform depth, and temporal +observations inform the motion of cameras and individual objects. Inspired by +these observations, we propose to study the problem of Video Extrapolation in +Space and Time (VEST). We propose a model that leverages the self-supervision +and the complementary cues from both tasks, while existing methods can only +solve one of them. 
Experiments show that our method achieves performance better +than or comparable to several state-of-the-art NVS and VP methods on indoor and +outdoor real-world datasets.",video extrapolation, +Recursive Reinforcement Learning,"Recursion is the fundamental paradigm to finitely describe potentially +infinite objects. As state-of-the-art reinforcement learning (RL) algorithms +cannot directly reason about recursion, they must rely on the practitioner's +ingenuity in designing a suitable ""flat"" representation of the environment. The +resulting manual feature constructions and approximations are cumbersome and +error-prone; their lack of transparency hampers scalability. To overcome these +challenges, we develop RL algorithms capable of computing optimal policies in +environments described as a collection of Markov decision processes (MDPs) that +can recursively invoke one another. Each constituent MDP is characterized by +several entry and exit points that correspond to input and output values of +these invocations. These recursive MDPs (or RMDPs) are expressively equivalent +to probabilistic pushdown systems (with call-stack playing the role of the +pushdown stack), and can model probabilistic programs with recursive procedural +calls. 
We introduce Recursive Q-learning -- a model-free RL algorithm for RMDPs +-- and prove that it converges for finite, single-exit and deterministic +multi-exit RMDPs under mild assumptions.",reinforcement learning, diff --git a/TKPD/prompt_keyword_async_search.py b/TKPD/prompt_keyword_async_search.py new file mode 100644 index 0000000..12ddc58 --- /dev/null +++ b/TKPD/prompt_keyword_async_search.py @@ -0,0 +1,84 @@ +import os +import csv +from concurrent.futures import ThreadPoolExecutor, as_completed +from retry import retry +import Levenshtein +from tqdm import tqdm + +API_SECRET_KEY = "sk-xxx" +BASE_URL = "xxx" +os.environ["OPENAI_API_KEY"] = API_SECRET_KEY +os.environ["OPENAI_API_BASE"] = BASE_URL +from langchain_core.messages import HumanMessage, SystemMessage, AIMessage +from langchain_openai import ChatOpenAI + +def normalized_edit_distance(str1, str2): + + str1 = str1.strip().lower() + str2 = str2.strip().lower() + + + edit_distance = Levenshtein.distance(str1, str2) + + + max_length = max(len(str1), len(str2)) + + + normalized_distance = edit_distance / max_length if max_length != 0 else 0 + return normalized_distance +usr_prompts = ["Given the following title and abstract of the research paper, identify the core task or problem being addressed in few words. You MUST respond with the keyphrase ONLY in this format: xxx", + "Based on the given title and abstract, what is the main focus or task of the research? Summarize it in a few words. You MUST respond with the keyphrase ONLY in this format: xxx", + "Analyze the title and abstract provided to identify the central task or topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as certain model names, unless widely recognized.). 
You MUST respond with the keyword ONLY in this format: xxx" + ] +@retry(delay=2) +def get_chatgpt_field(title, abstract=None, sys_content=None, usr_prompt=None, extra_prompt=True, model="gpt-3.5-turbo-0125", temperature=0): + if not sys_content: + sys_content = ( + "You are a profound researcher who is good at identifying the topic key phrase from paper's title and " + "abstract. Ensure that the topic key phrase precisely defines the research area within the article. For effective academic searching, such as on Google Scholar, the field should be specifically targeted rather than broadly categorized. For instance, use 'image classification' instead of the general 'computer vision' to enhance relevance and searchability of related literature.") + if not usr_prompt: + usr_prompt = ("Analyze the title and abstract provided to identify the central topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as obscure model names, unless widely recognized). Focus on a keyword that reflects the innovative aspect or core methodology of the study. You MUST respond with the keyword ONLY in this format: xxx") + + messages = [SystemMessage(content=sys_content)] + + extra_abs_content = ''' + Given Title: Large Selective Kernel Network for Remote Sensing Object Detection + Given Abstract: Recent research on remote sensing object detection has largely focused on improving the representation of oriented bounding boxes but has overlooked the unique prior knowledge presented in remote sensing scenarios. Such prior knowledge can be useful because tiny remote sensing objects may be mistakenly detected without referencing a sufficiently long-range context, which can vary for different objects. This paper considers these priors and proposes the lightweight Large Selective Kernel Network (LSKNet). 
LSKNet can dynamically adjust its large spatial receptive field to better model the ranging context of various objects in remote sensing scenarios. To our knowledge, large and selective kernel mechanisms have not been previously explored in remote sensing object detection. Without bells and whistles, our lightweight LSKNet sets new state-of-the-art scores on standard benchmarks, i.e., HRSC2016 (98.46% mAP), DOTA-v1.0 (81.85% mAP), and FAIR1M-v1.0 (47.87% mAP).''' if abstract else '' + if extra_prompt: + messages += [HumanMessage(content=f'''{usr_prompt}\n\n{extra_abs_content}'''), AIMessage(content='remote sensing object detection')] + + content = f'''{usr_prompt}\n + Given Title: {title} + ''' + if abstract: + content += f'Given Abstract: {abstract}' + messages.append(HumanMessage(content=content)) + + chat = ChatOpenAI(model=model, temperature=temperature) + + return chat.batch([messages])[0].content +import csv +from multiprocessing import Pool +prompt = "Identify the research field from the given title and abstract. 
You MUST respond with the keyword ONLY in this format: xxx" +def process_row(row): + title, abs, GT_kwd = row[0], row[1], row[2] + pred_kwd = get_chatgpt_field(title, abs, usr_prompt=prompt) # This should be replaced with the actual prediction logic + # Assuming normalized_edit_distance is defined elsewhere + + ned = normalized_edit_distance(GT_kwd, pred_kwd) + print(f'GT:{GT_kwd} \t Pred:{pred_kwd} \t Ned:{ned}') + return ned + +def main(): + with open(r'TKPD.csv','r', newline='', encoding='gbk') as input_csvfile: + reader = csv.reader(input_csvfile) + rows = [row for row in reader] + print(len(rows)) + with Pool(12) as p: + results = p.map(process_row, rows) + + average_distance = sum(results) / len(results) if results else 0 + print(f"{prompt}: {average_distance}") + +if __name__ == '__main__': + main() diff --git a/previous_methods/Doc2Vec&Bi-LSTM.py b/previous_methods/Doc2Vec&Bi-LSTM.py new file mode 100644 index 0000000..78232d7 --- /dev/null +++ b/previous_methods/Doc2Vec&Bi-LSTM.py @@ -0,0 +1,250 @@ + +import pandas as pd +import numpy as np +import torch +from torch.utils.data import Dataset +from gensim.models.doc2vec import Doc2Vec, TaggedDocument +import nltk +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +import string +from torch.utils.data import DataLoader +import torch.nn as nn +from sklearn.model_selection import train_test_split +import nltk +from tqdm import tqdm + +nltk.download('punkt') +nltk.download('omw-1.4') + +# def dcg_at_k(scores, k): +# """ +# scores: a list of relevance scores in predicted order +# k: number of results to consider +# """ +# scores = np.asfarray(scores)[:k] +# return np.sum(scores / np.log2(np.arange(2, scores.size + 2))) +# +# def ndcg_at_k(predicted_scores, true_scores, k): +# """ +# predicted_scores: model's predicted scores +# true_scores: ground truth scores +# k: number of results to consider +# """ +# idcg = dcg_at_k(sorted(true_scores, reverse=True), k) +# dcg = 
dcg_at_k(predicted_scores, k) +# return dcg / idcg if idcg > 0 else 0 + +import torch +import numpy as np + +from sklearn.metrics import ndcg_score + +def NDCG_k(predictions, labels, k=20): + print(print(predictions.shape, labels.shape)) + predictions = predictions.squeeze().detach().cpu().numpy() + labels = labels.squeeze().detach().cpu().numpy() + if len(predictions) < k: + return -1 + + + ndcg = ndcg_score([labels], [predictions], k=k) + + print("Average NDCG:", ndcg) + return ndcg + +# nltk.download('stopwords') +# nltk.download('wordnet') + + +def preprocess_text(text): + stop_words = set(stopwords.words('english')) + lemmatizer = WordNetLemmatizer() + + + text = text.lower() + text = ''.join([char for char in text if char not in string.punctuation]) + words = nltk.word_tokenize(text) + words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words] + + return words + + + +class PapersDataset(Dataset): + def __init__(self, dataframe, doc2vec_model,target_type='TNCSI'): + """ + dataframe: 传入的DataFrame,包含训练或测试数据 + doc2vec_model: 已训练的Doc2Vec模型 + """ + self.dataframe = dataframe + self.doc2vec_model = doc2vec_model + self.target_type = target_type + def __len__(self): + return len(self.dataframe) + + def __getitem__(self, idx): + row = self.dataframe.iloc[idx] + metadata = f"{row['title']} {row['abstract']}" + processed_text = preprocess_text(metadata) + vector = self.doc2vec_model.infer_vector(processed_text) + if self.target_type.startswith('TNCSI'): + label = row[self.target_type] + else: + label = row['cites'] + return torch.tensor(vector, dtype=torch.float32), torch.tensor(label, dtype=torch.float32) + + +def train_doc2vec(documents): + tagged_data = [TaggedDocument(words=preprocess_text(doc), tags=[i]) for i, doc in enumerate(documents)] + model = Doc2Vec(tagged_data, vector_size=100, window=2, min_count=1, workers=4, epochs=40) + return model + + + +class Attention(nn.Module): + def __init__(self, hidden_size): + super(Attention, 
self).__init__() + self.linear = nn.Linear(hidden_size, 1) + + def forward(self, lstm_output): + + weights = torch.tanh(self.linear(lstm_output)) + weights = torch.softmax(weights, dim=1) + + weighted = torch.mul(lstm_output, weights.expand_as(lstm_output)) + + return torch.sum(weighted, dim=1) + +class CitationModel(nn.Module): + def __init__(self, embedding_dim, hidden_dim,target_type='TNCSI'): + super(CitationModel, self).__init__() + self.bi_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True) + self.attention = Attention(hidden_dim * 2) + self.fc = nn.Linear(hidden_dim * 2, 1) + self.sigmoid = nn.Sigmoid() + self.relu = nn.ReLU() + self.target_type = target_type + + def forward(self, x): + lstm_out, _ = self.bi_lstm(x) + attention_out = self.attention(lstm_out) + output = self.fc(attention_out) + if self.target_type.startswith('TNCSI'): + output = self.sigmoid(output) + else: + output = self.relu(output) + return output + + + +def train_model(model, train_loader, criterion, optimizer, epochs, device): + model.train() + loss_history = [] + for epoch in tqdm(range(epochs)): + total_loss = 0 + for inputs,targets in train_loader: + inputs, targets = inputs.to(device).unsqueeze(1), targets.to(device).unsqueeze(1) + optimizer.zero_grad() + outputs = model(inputs) + + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + total_loss += loss.item() + avg_loss = total_loss / len(train_loader) + loss_history.append(avg_loss) + print(f'Epoch {epoch+1}, Loss: {avg_loss}') + # else: + # for inputs,_,targets in train_loader: + + # optimizer.zero_grad() + # outputs = model(inputs) + # + # loss = criterion(outputs, targets) + # loss.backward() + # optimizer.step() + # total_loss += loss.item() + # avg_loss = total_loss / len(train_loader) + # loss_history.append(avg_loss) + # print(f'Epoch {epoch + 1}, Loss: {avg_loss}') + + print("Training complete. 
Loss history:") + print(loss_history) + +# Evaluation function for NDCG +def evaluate_model(model, test_loader, device,k=20): + model.eval() + pred_scores = [] + target_scores = [] + with torch.no_grad(): + total_loss = 0 + + + for inputs, target in test_loader: + inputs = inputs.to(device).unsqueeze(1) # Ensure input is correctly shaped + outputs = model(inputs) + + # Flatten outputs and targets for NDCG computation + predicted_scores = outputs.squeeze() + true_scores = target.squeeze() + loss = nn.MSELoss()(predicted_scores.detach().cpu(),true_scores.detach().cpu()) + total_loss += loss.item() + print(predicted_scores) + print(true_scores) + print('-'*50) + pred_scores.append(outputs) + target_scores.append(true_scores) + + avg_loss = total_loss / len(test_loader) + print(f'AVG MSE:{avg_loss}') + + all_pred = torch.cat(pred_scores, dim=0).squeeze() + all_GT = torch.cat(target_scores, dim=0).squeeze() + + # all_pred = torch.Tensor(pred_scores) + # all_GT = torch.Tensor(target_scores) + ndcg = NDCG_k(all_pred, all_GT,k=k) + print(ndcg) + + return ndcg +# Main function +def main(): + csv_file = r' Desktop\NAID_train_extrainfo.csv' + target_type = 'TNCSI_SP' + + train_data = pd.read_csv(csv_file) + test_data = pd.read_csv(r' Desktop\NAID_test_extrainfo.csv') + + # Train the Doc2Vec model on training data abstracts + train_documents = train_data['abstract'].tolist() + doc2vec_model = train_doc2vec(train_documents) + + # Create training and testing datasets + + train_dataset = PapersDataset(dataframe=train_data, doc2vec_model=doc2vec_model,target_type=target_type) + test_dataset = PapersDataset(dataframe=test_data, doc2vec_model=doc2vec_model,target_type=target_type) + + # Create DataLoaders + train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False) + + # Model setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(device) + model = 
CitationModel(embedding_dim=100, hidden_dim=1024,target_type=target_type).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.005) + criterion = nn.MSELoss() + + ep = 30 + + train_model(model, train_loader, criterion, optimizer, epochs=ep, device=device) + torch.save(model.state_dict(), f'LSTM-{target_type}-{ep}.pth') + + # model.load_state_dict(torch.load(f'LSTM-{target_type}-{ep}.pth')) + + # Evaluate using NDCG + evaluate_model(model, test_loader, device=device) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/previous_methods/Ensemble_MLP.py b/previous_methods/Ensemble_MLP.py new file mode 100644 index 0000000..4bbdf1b --- /dev/null +++ b/previous_methods/Ensemble_MLP.py @@ -0,0 +1,97 @@ +import string +import numpy as np +import pandas as pd +import torch +from torch.utils.data import Dataset, DataLoader +import torch.nn as nn +import torch.optim as optim +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import ndcg_score +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +class LinearRegressionModel(nn.Module): + def __init__(self, input_dim): + super(LinearRegressionModel, self).__init__() + self.linear1 = nn.Linear(input_dim, input_dim) + self.linear2 = nn.Linear(input_dim, input_dim) + self.linear3 = nn.Linear(input_dim, 1) + + def forward(self, x): + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + +class CitationDataset(Dataset): + def __init__(self, features, labels): + self.features = torch.tensor(features, dtype=torch.float32) + self.labels = torch.tensor(labels, dtype=torch.float32) + + def __len__(self): + return len(self.labels) + + def __getitem__(self, idx): + return self.features[idx], self.labels[idx] + +def preprocess_and_scale_data(df, arxiv_type): + df = df[df['arxiv_type'] == arxiv_type] + df['title_length'] = df['title'].apply(lambda x: len(x.split())) + 
df['title_punctuation'] = df['title'].apply(lambda x: sum([1 for char in x if char in string.punctuation])) + df['pdf_page_num'] = df['pdf_page_num'].fillna(df['pdf_page_num'].mean()) + df['SMP'] = df['SMP'].fillna(df['SMP'].mean()) + features = df[['pdf_page_num', 'title_length', 'title_punctuation', 'Ref_num', 'SMP']].values + labels = df['cites'].values + + scaler = StandardScaler() + scaled_features = scaler.fit_transform(features) + return scaled_features, labels + +data = pd.read_csv('NAID_train_extrainfo.csv') +arxiv_types = data['arxiv_type'].unique() + +models = {} +train_loaders = {} +test_loaders = {} +for arxiv_type in arxiv_types: + scaled_features, targets = preprocess_and_scale_data(data, arxiv_type) + X_train, X_test, y_train, y_test = train_test_split(scaled_features, targets, test_size=0.2, random_state=42) + train_dataset = CitationDataset(X_train, y_train) + test_dataset = CitationDataset(X_test, y_test) + train_loaders[arxiv_type] = DataLoader(train_dataset, batch_size=64, shuffle=True) + test_loaders[arxiv_type] = DataLoader(test_dataset, batch_size=64, shuffle=False) + model = LinearRegressionModel(X_train.shape[1]).to(device) + models[arxiv_type] = model + + +for arxiv_type in arxiv_types: + model = models[arxiv_type] + optimizer = optim.Adam(model.parameters(), lr=0.01) + criterion = nn.MSELoss() + model.train() + for epoch in range(50): + for features, labels in train_loaders[arxiv_type]: + features, labels = features.to(device), labels.to(device) + optimizer.zero_grad() + outputs = model(features) + loss = criterion(outputs, labels.unsqueeze(1)) + loss.backward() + optimizer.step() + print(f'Epoch {epoch+1}, Loss: {loss.item()} for arxiv_type {arxiv_type}') + + +predictions = [] +actuals = [] +for arxiv_type in arxiv_types: + model = models[arxiv_type].eval() + with torch.no_grad(): + for features, labels in test_loaders[arxiv_type]: + features = features.to(device) + outputs = model(features) + 
predictions.extend(outputs.cpu().numpy().flatten().tolist()) + actuals.extend(labels.numpy().tolist()) + +mse = np.mean((np.array(predictions) - np.array(actuals)) ** 2) +print(f'Test MSE: {mse}') + +ndcg_value = ndcg_score([actuals], [predictions], k=20) +print(f'NDCG Score: {ndcg_value}') diff --git a/previous_methods/GPT_Predict.py b/previous_methods/GPT_Predict.py new file mode 100644 index 0000000..eaacd0f --- /dev/null +++ b/previous_methods/GPT_Predict.py @@ -0,0 +1,177 @@ +import pandas as pd + +from tools.test import get_filename_without_extension +import copy +import json +import os +from urllib.error import URLError + +import requests +import tiktoken +from bs4 import BeautifulSoup +from langchain_core.exceptions import OutputParserException + +def get_filename_without_extension(file_path): + # Extract the filename without extension + filename_without_extension = os.path.splitext(os.path.basename(file_path))[0] + return filename_without_extension + + +import pandas as pd +from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor, as_completed +from tqdm import tqdm + +from sqlalchemy.orm.exc import NoResultFound +from sqlalchemy.orm import declarative_base + +from langchain.output_parsers import PydanticOutputParser, OutputFixingParser +from pydantic import BaseModel, Field + +from typing import Dict, List +from langchain.output_parsers import PydanticOutputParser, OutputFixingParser +import langchain + +from langchain.chains import LLMChain + +import os + +from langchain.prompts import PromptTemplate + +from langchain.chains.question_answering import load_qa_chain + + +import os +import sys +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(BASE_DIR) +from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter +from retry import retry + +langchain.debug = False +import arxiv + +import time + +import ssl + 
+ssl._create_default_https_context = ssl._create_unverified_context +import glob + +from langchain_community.document_loaders import PDFMinerLoader +from langchain_community.chat_models import ChatOpenAI + +from database.DBEntity import * +from furnace.arxiv_paper import Arxiv_paper, get_arxiv_id_from_url +from sqlalchemy import create_engine, and_ +from sqlalchemy.orm import sessionmaker, scoped_session +import logging +import datetime + +engine = create_engine('xxx/scitepredict') + +Base = declarative_base() + +Base.metadata.create_all(engine) + + +Session = sessionmaker(bind=engine) +session = Session() +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) +session_factory = scoped_session(SessionLocal) +import PyPDF2 +items = [ + "Engaging", "Controversial", "Rigorous", "Innovative", "Accessible", "Methodical", "Concise", "Persuasive", + "Comprehensive", "Insightful", "Relevant", "Objective", "Replicable", "Structured", "Coherent", "Original", + "Balanced", "Authoritative", "Impactful", "Interdisciplinary", "Well-sourced", "Technical", "Provocative", + "Hypothesis-driven", "Ethical", "Difficult to understand", "Exciting", "Not well written", "Theoretical", "To the point", + "Disengaging", "Uncontroversial", "Lax", "Conventional", "Inaccessible", "Haphazard", "Verbose", "Unconvincing", + "Superficial", "Uninsightful", "Irrelevant", "Subjective", "Non-replicable", "Unstructured", "Incoherent", "Derivative", + "Unbalanced", "Unreliable", "Inconsequential", "Narrow", "Poorly-sourced", "Nontechnical", "Unprovocative", + "Speculation-driven", "Unethical", "Easy to understand", "Dull", "Well written", "Empirical", "Circumlocutory" +] + +def parse_scores(content): + + try: + scores = [int(line.split()[1]) for line in content.split('\n')] + + mean_score = sum(scores) / len(scores) + return mean_score + except Exception as e: + print(e) + return 0 + +def paper_rating(abstract): + # download_paper(row, out_dir=r'J:\arxiv') + + prompt = f"Please rate 
the following abstract on each of the 60 items from 0 = Not at all to 100 = Very much. Only provide the numbers. For example:\n\n" + prompt += "1. 65\n2. 50\n3. 5\n4. 95\n5. …\n\n" + prompt += f"This is the abstract:\n{abstract}\n\n" + prompt += "These are the items:\n" + "\n".join([f"{i + 1}. {item}" for i, item in enumerate(items)]) + prompt_template = prompt + + llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0) # Lite Speed + rst = llm.invoke(prompt_template) + content = rst.content + + return parse_scores(content) + + +def main(): + + data = pd.read_csv(r'xxx\NAID\NAID_test_extrainfo.csv') + + + scores = [] + + + with ThreadPoolExecutor(max_workers=10) as executor: + + future_to_abstract = {executor.submit(paper_rating, abstract): abstract for abstract in data['abstract']} + + + for future in tqdm(as_completed(future_to_abstract)): + score = future.result() + scores.append(score) + + data['average_score'] = scores + + + columns_to_save = ['id', 'cites', 'TNCSI', 'TNCSI_SP', 'abstract', 'average_score'] + data[columns_to_save].to_csv(r'gpt_predict.csv', index=False) + + +import pandas as pd +from sklearn.metrics import ndcg_score +import numpy as np + + +def calculate_ndcg(file_path): + + data = pd.read_csv(file_path) + + + if 'average_score' not in data.columns or 'cites' not in data.columns: + return "The required columns are not in the dataframe." 
+ + + y_true = data['cites'].to_numpy() + y_score = data['average_score'].to_numpy() + + # Reshape data for ndcg calculation (1, -1) as ndcg expects at least 2D arrays + y_true = y_true.reshape(1, -1) + y_score = y_score.reshape(1, -1) + + + ndcg = ndcg_score(y_true, y_score,k=20) + + return ndcg + + + + +if __name__ == "__main__": + # main() + ndcg_value = calculate_ndcg('gpt_predict.csv') + print(f"The NDCG value is: {ndcg_value}") \ No newline at end of file diff --git a/previous_methods/MLP.py b/previous_methods/MLP.py new file mode 100644 index 0000000..e0b2be7 --- /dev/null +++ b/previous_methods/MLP.py @@ -0,0 +1,95 @@ +import string +import numpy as np +import pandas as pd +import torch +from torch.utils.data import Dataset, DataLoader +import torch.nn as nn +import torch.optim as optim +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import ndcg_score + + +class LinearRegressionModel(nn.Module): + def __init__(self, input_dim): + super(LinearRegressionModel, self).__init__() + self.linear = nn.Linear(input_dim, input_dim) + self.linear2 = nn.Linear(input_dim, 1) + + def forward(self, x): + latent = self.linear(x) + return self.linear2(latent) + # return nn.Sigmoid()(self.linear(x)) + + +class CitationDataset(Dataset): + def __init__(self, features, labels): + self.features = torch.tensor(features, dtype=torch.float32) + self.labels = torch.tensor(labels, dtype=torch.float32) + + def __len__(self): + return len(self.labels) + + def __getitem__(self, idx): + return self.features[idx], self.labels[idx] + + +def preprocess_and_scale_data(df): + df['title_length'] = df['title'].apply(lambda x: len(x.split())) + df['title_punctuation'] = df['title'].apply(lambda x: sum([1 for char in x if char in string.punctuation])) + df['pdf_page_num'] = df['pdf_page_num'].fillna(df['pdf_page_num'].mean()) + df['SMP'] = df['SMP'].fillna(df['SMP'].mean()) + features = df[['pdf_page_num', 'title_length', 'title_punctuation', 'Ref_num', 'SMP']].values + 
labels = df['cites'].values + + scaler = StandardScaler() + scaled_features = scaler.fit_transform(features) + return scaled_features, labels + + +train_data = pd.read_csv('NAID_train_extrainfo.csv') +train_scaled_features, train_targets = preprocess_and_scale_data(train_data) +test_data = pd.read_csv('NAID_test_extrainfo.csv') +test_scaled_features, test_targets = preprocess_and_scale_data(test_data) + + +train_dataset = CitationDataset(train_scaled_features, train_targets) +test_dataset = CitationDataset(test_scaled_features, test_targets) +train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) +test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False) + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = LinearRegressionModel(train_scaled_features.shape[1]).to(device) +optimizer = optim.Adam(model.parameters(), lr=0.01) +criterion = nn.MSELoss() + + +model.train() +for epoch in range(50): + for features, labels in train_loader: + features, labels = features.to(device), labels.to(device) + optimizer.zero_grad() + outputs = model(features) + loss = criterion(outputs, labels.unsqueeze(1)) + loss.backward() + optimizer.step() + print(f'Epoch {epoch+1}, Loss: {loss.item()}') + + +model.eval() +predictions = [] +actuals = [] +with torch.no_grad(): + for features, labels in test_loader: + features = features.to(device) + outputs = model(features) + predictions.extend(outputs.cpu().numpy().flatten().tolist()) + actuals.extend(labels.numpy().tolist()) + + +mse = np.mean((np.array(predictions) - np.array(actuals)) ** 2) +print(f'Test MSE: {mse}') + + +ndcg_value = ndcg_score([actuals], [predictions], k=20) +print(f'NDCG Score: {ndcg_value}')