@article{silver_reward_2021,
title = {Reward is enough},
volume = {299},
issn = {0004-3702},
url = {https://www.sciencedirect.com/science/article/pii/S0004370221000862},
doi = {10.1016/j.artint.2021.103535},
abstract = {In this article we hypothesise that intelligence, and its associated abilities, can be understood as subserving the maximisation of reward. Accordingly, reward is enough to drive behaviour that exhibits abilities studied in natural and artificial intelligence, including knowledge, learning, perception, social intelligence, language, generalisation and imitation. This is in contrast to the view that specialised problem formulations are needed for each ability, based on other signals or objectives. Furthermore, we suggest that agents that learn through trial and error experience to maximise reward could learn behaviour that exhibits most if not all of these abilities, and therefore that powerful reinforcement learning agents could constitute a solution to artificial general intelligence.},
language = {en},
urldate = {2022-02-09},
journal = {Artificial Intelligence},
author = {Silver, David and Singh, Satinder and Precup, Doina and Sutton, Richard S.},
month = oct,
year = {2021},
keywords = {Reinforcement learning, Artificial general intelligence, Artificial intelligence, Reward},
pages = {103535},
file = {ScienceDirect Snapshot:/home/dferigo/Zotero/storage/XUXAKHJ9/S0004370221000862.html:text/html;Silver et al_2021_Reward is enough.pdf:/home/dferigo/Zotero/storage/DE8PXLYA/Silver et al_2021_Reward is enough.pdf:application/pdf},
}
@article{cleach_fast_2021,
title = {Fast {Contact}-{Implicit} {Model}-{Predictive} {Control}},
url = {http://arxiv.org/abs/2107.05616},
abstract = {We present a general approach for controlling robotic systems that make and break contact with their environments. Contact-implicit model-predictive control (CI-MPC) generalizes linear MPC to contact-rich settings by relying on linear complementarity problems (LCP) computed using strategic Taylor approximations about a reference trajectory and retaining non-smooth impact and friction dynamics, allowing the policy to not only reason about contact forces and timing, but also generate entirely new contact mode sequences online. To achieve reliable and fast numerical convergence, we devise a structure-exploiting, path-following solver for the LCP contact dynamics and a custom trajectory optimizer for trajectory-tracking MPC problems. We demonstrate CI-MPC at real-time rates in simulation, and show that it is robust to model mismatch and can respond to disturbances by discovering and exploiting new contact modes across a variety of robotic systems, including a pushbot, hopper, and planar quadruped and biped.},
language = {en},
urldate = {2021-11-26},
journal = {arXiv:2107.05616 [cs, eess]},
author = {Le Cleac'h, Simon and Howell, Taylor and Schwager, Mac and Manchester, Zachary},
month = sep,
year = {2021},
note = {arXiv: 2107.05616},
keywords = {Computer Science - Robotics, Electrical Engineering and Systems Science - Systems and Control},
file = {Cleac'h et al. - 2021 - Fast Contact-Implicit Model-Predictive Control.pdf:/home/dferigo/Zotero/storage/FCY6KR2I/Cleac'h et al. - 2021 - Fast Contact-Implicit Model-Predictive Control.pdf:application/pdf},
}
@article{roy_machine_2021,
title = {From {Machine} {Learning} to {Robotics}: {Challenges} and {Opportunities} for {Embodied} {Intelligence}},
shorttitle = {From {Machine} {Learning} to {Robotics}},
url = {http://arxiv.org/abs/2110.15245},
abstract = {Machine learning has long since become a keystone technology, accelerating science and applications in a broad range of domains. Consequently, the notion of applying learning methods to a particular problem set has become an established and valuable modus operandi to advance a particular field. In this article we argue that such an approach does not straightforwardly extend to robotics -- or to embodied intelligence more generally: systems which engage in a purposeful exchange of energy and information with a physical environment. In particular, the purview of embodied intelligent agents extends significantly beyond the typical considerations of mainstream machine learning approaches, which typically (i) do not consider operation under conditions significantly different from those encountered during training; (ii) do not consider the often substantial, long-lasting and potentially safety-critical nature of interactions during learning and deployment; (iii) do not require ready adaptation to novel tasks while at the same time (iv) effectively and efficiently curating and extending their models of the world through targeted and deliberate actions. In reality, therefore, these limitations result in learning-based systems which suffer from many of the same operational shortcomings as more traditional, engineering-based approaches when deployed on a robot outside a well defined, and often narrow operating envelope. Contrary to viewing embodied intelligence as another application domain for machine learning, here we argue that it is in fact a key driver for the advancement of machine learning technology. In this article our goal is to highlight challenges and opportunities that are specific to embodied intelligence and to propose research directions which may significantly advance the state-of-the-art in robot learning.},
language = {en},
urldate = {2021-11-03},
journal = {arXiv:2110.15245 [cs]},
author = {Roy, Nicholas and Posner, Ingmar and Barfoot, Tim and Beaudoin, Philippe and Bengio, Yoshua and Bohg, Jeannette and Brock, Oliver and Depatie, Isabelle and Fox, Dieter and Koditschek, Dan and Lozano-Perez, Tomas and Mansinghka, Vikash and Pal, Christopher and Richards, Blake and Sadigh, Dorsa and Schaal, Stefan and Sukhatme, Gaurav and Therien, Denis and Toussaint, Marc and Van de Panne, Michiel},
month = oct,
year = {2021},
note = {arXiv: 2110.15245},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics},
file = {Roy et al_2021_From Machine Learning to Robotics.pdf:/home/dferigo/Zotero/storage/CA7DHLB4/Roy et al_2021_From Machine Learning to Robotics.pdf:application/pdf},
}
@techreport{traversaro_multibody_2019,
title = {Multibody dynamics notation},
url = {https://pure.tue.nl/ws/portalfiles/portal/139293126/A_Multibody_Dynamics_Notation_Revision_2_.pdf},
language = {en},
author = {Traversaro, Silvio and Saccon, Alessandro},
year = {2019},
pages = {24},
}
@inproceedings{liu_robot_2021,
title = {Robot {Reinforcement} {Learning} on the {Constraint} {Manifold}},
url = {https://openreview.net/forum?id=zwo1-MdMl1P},
abstract = {Reinforcement learning in robotics is extremely challenging due to many practical issues, including safety, mechanical constraints, and wear and tear. Typically, these issues are not considered in...},
language = {en},
urldate = {2021-10-21},
author = {Liu, Puze and Tateo, Davide and Ammar, Haitham Bou and Peters, Jan},
month = jun,
year = {2021},
file = {Liu et al_2021_Robot Reinforcement Learning on the Constraint Manifold.pdf:/home/dferigo/Zotero/storage/8VD2XRMC/Liu et al_2021_Robot Reinforcement Learning on the Constraint Manifold.pdf:application/pdf},
}
@article{acosta_validating_2021,
title = {Validating {Robotics} {Simulators} on {Real} {World} {Impacts}},
url = {http://arxiv.org/abs/2110.00541},
abstract = {A realistic simulation environment is an essential tool in every roboticist's toolkit, with uses ranging from planning and control to training policies with reinforcement learning. Despite the centrality of simulation in modern robotics, little work has been done to compare the performance of robotics simulators against real-world data, especially for scenarios involving dynamic motions with high speed impact events. Handling dynamic contact is the computational bottleneck for most simulations, and thus the modeling and algorithmic choices surrounding impacts and friction form the largest distinctions between popular tools. Here, we evaluate the ability of several simulators to reproduce real-world trajectories involving impacts. Using experimental data, we identify system-specific contact parameters of popular simulators Drake, MuJoCo, and Bullet, analyzing the effects of modeling choices around these parameters. For the simple example of a cube tossed onto a table, simulators capture inelastic impacts well while failing to capture elastic impacts. For the higher-dimensional case of a Cassie biped landing from a jump, the simulators capture the bulk motion well but the accuracy is limited by numerous model differences between the real robot and the simulators.},
urldate = {2021-10-15},
journal = {arXiv:2110.00541 [cs]},
author = {Acosta, Brian and Yang, William and Posa, Michael},
month = oct,
year = {2021},
note = {arXiv: 2110.00541},
keywords = {Computer Science - Robotics},
file = {Acosta et al_2021_Validating Robotics Simulators on Real World Impacts.pdf:/home/dferigo/Zotero/storage/D4WET8GW/Acosta et al_2021_Validating Robotics Simulators on Real World Impacts.pdf:application/pdf;arXiv.org Snapshot:/home/dferigo/Zotero/storage/LU98NNTF/2110.html:text/html},
}
@inproceedings{gronauer_successful_2021,
address = {Montreal, Canada},
title = {The {Successful} {Ingredients} of {Policy} {Gradient} {Algorithms}},
isbn = {978-0-9992411-9-6},
url = {https://www.ijcai.org/proceedings/2021/338},
doi = {10.24963/ijcai.2021/338},
abstract = {Despite the sublime success in recent years, the underlying mechanisms powering the advances of reinforcement learning are yet poorly understood. In this paper, we identify these mechanisms - which we call ingredients - in on-policy policy gradient methods and empirically determine their impact on the learning. To allow an equitable assessment, we conduct our experiments based on a unified and modular implementation. Our results underline the significance of recent algorithmic advances and demonstrate that reaching state-of-the-art performance may not need sophisticated algorithms but can also be accomplished by the combination of a few simple ingredients.},
language = {en},
urldate = {2021-10-14},
booktitle = {Proceedings of the {Thirtieth} {International} {Joint} {Conference} on {Artificial} {Intelligence}},
publisher = {International Joint Conferences on Artificial Intelligence Organization},
author = {Gronauer, Sven and Gottwald, Martin and Diepold, Klaus},
month = aug,
year = {2021},
pages = {2455--2461},
file = {Gronauer et al_2021_The Successful Ingredients of Policy Gradient Algorithms.pdf:/home/dferigo/Zotero/storage/B9A5Y9UV/Gronauer et al_2021_The Successful Ingredients of Policy Gradient Algorithms.pdf:application/pdf},
}
@article{engstrom_implementation_2020,
title = {Implementation {Matters} in {Deep} {Policy} {Gradients}: {A} {Case} {Study} on {PPO} and {TRPO}},
shorttitle = {Implementation {Matters} in {Deep} {Policy} {Gradients}},
url = {http://arxiv.org/abs/2005.12729},
abstract = {We study the roots of algorithmic progress in deep policy gradient algorithms through a case study on two popular algorithms: Proximal Policy Optimization (PPO) and Trust Region Policy Optimization (TRPO). Specifically, we investigate the consequences of "code-level optimizations:" algorithm augmentations found only in implementations or described as auxiliary details to the core algorithm. Seemingly of secondary importance, such optimizations turn out to have a major impact on agent behavior. Our results show that they (a) are responsible for most of PPO's gain in cumulative reward over TRPO, and (b) fundamentally change how RL methods function. These insights show the difficulty and importance of attributing performance gains in deep reinforcement learning. Code for reproducing our results is available at https://github.com/MadryLab/implementation-matters .},
urldate = {2021-05-27},
journal = {arXiv:2005.12729 [cs, stat]},
author = {Engstrom, Logan and Ilyas, Andrew and Santurkar, Shibani and Tsipras, Dimitris and Janoos, Firdaus and Rudolph, Larry and Madry, Aleksander},
month = may,
year = {2020},
note = {arXiv: 2005.12729},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Statistics - Machine Learning},
file = {arXiv.org Snapshot:/home/dferigo/Zotero/storage/AI6US5L8/2005.html:text/html;Engstrom et al_2020_Implementation Matters in Deep Policy Gradients.pdf:/home/dferigo/Zotero/storage/HSDXPTLP/Engstrom et al_2020_Implementation Matters in Deep Policy Gradients.pdf:application/pdf},
}
@misc{schulman_nuts_2016,
title = {The {Nuts} and {Bolts} of {Deep} {RL} {Research}},
url = {https://rll.berkeley.edu/deeprlcourse/docs/nuts-and-bolts.pdf},
language = {en},
author = {Schulman, John},
month = dec,
year = {2016},
file = {Schulman - The Nuts and Bolts of Deep RL Research.pdf:/home/dferigo/Zotero/storage/NWLULPJC/Schulman - The Nuts and Bolts of Deep RL Research.pdf:application/pdf},
}
@article{andrychowicz_what_2020,
title = {What {Matters} {In} {On}-{Policy} {Reinforcement} {Learning}? {A} {Large}-{Scale} {Empirical} {Study}},
shorttitle = {What {Matters} {In} {On}-{Policy} {Reinforcement} {Learning}?},
url = {http://arxiv.org/abs/2006.05990},
abstract = {In recent years, on-policy reinforcement learning (RL) has been successfully applied to many different continuous control tasks. While RL algorithms are often conceptually simple, their state-of-the-art implementations take numerous low- and high-level design decisions that strongly affect the performance of the resulting agents. Those choices are usually not extensively discussed in the literature, leading to discrepancy between published descriptions of algorithms and their implementations. This makes it hard to attribute progress in RL and slows down overall progress [Engstrom'20]. As a step towards filling that gap, we implement {\textgreater}50 such ``choices'' in a unified on-policy RL framework, allowing us to investigate their impact in a large-scale empirical study. We train over 250'000 agents in five continuous control environments of different complexity and provide insights and practical recommendations for on-policy training of RL agents.},
urldate = {2021-05-27},
journal = {arXiv:2006.05990 [cs, stat]},
author = {Andrychowicz, Marcin and Raichuk, Anton and Stańczyk, Piotr and Orsini, Manu and Girgin, Sertan and Marinier, Raphael and Hussenot, Léonard and Geist, Matthieu and Pietquin, Olivier and Michalski, Marcin and Gelly, Sylvain and Bachem, Olivier},
month = jun,
year = {2020},
note = {arXiv: 2006.05990},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
file = {Andrychowicz et al_2020_What Matters In On-Policy Reinforcement Learning.pdf:/home/dferigo/Zotero/storage/WZMAKPPK/Andrychowicz et al_2020_What Matters In On-Policy Reinforcement Learning.pdf:application/pdf;arXiv.org Snapshot:/home/dferigo/Zotero/storage/U5JYD637/2006.html:text/html},
}
@article{korber_comparing_2021,
title = {Comparing {Popular} {Simulation} {Environments} in the {Scope} of {Robotics} and {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/2103.04616},
abstract = {This letter compares the performance of four different, popular simulation environments for robotics and reinforcement learning (RL) through a series of benchmarks. The benchmarked scenarios are designed carefully with current industrial applications in mind. Given the need to run simulations as fast as possible to reduce the real-world training time of the RL agents, the comparison includes not only different simulation environments but also different hardware configurations, ranging from an entry-level notebook up to a dual CPU high performance server. We show that the chosen simulation environments benefit the most from single core performance. Yet, using a multi core system, multiple simulations could be run in parallel to increase the performance.},
language = {en},
urldate = {2022-04-20},
journal = {arXiv:2103.04616 [cs]},
author = {Körber, Marian and Lange, Johann and Rediske, Stephan and Steinmann, Simon and Glück, Roland},
month = mar,
year = {2021},
note = {arXiv: 2103.04616},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Computer Science - Artificial Intelligence},
file = {Körber et al_2021_Comparing Popular Simulation Environments in the Scope of Robotics and.pdf:/home/dferigo/Zotero/storage/LWVBUMDD/Körber et al_2021_Comparing Popular Simulation Environments in the Scope of Robotics and.pdf:application/pdf},
}
@inproceedings{kim_survey_2021,
title = {A {Survey} on {Simulation} {Environments} for {Reinforcement} {Learning}},
doi = {10.1109/UR52253.2021.9494694},
abstract = {Most of the recent studies of reinforcement learning and robotics basically employ computer simulation due to the advantages of time and cost. For this reason, users have to spare time for investigation in order to choose optimal environment for their purposes. This paper presents a survey result that can be a guidance in user’s choice for simulation environments. The investigation result includes features, brief historical backgrounds, license policies and formats for robot and object description of the eight most popular environments in robot RL studies. We also propose a quantitative evaluation method for those simulation environments considering the features and a pragmatic point of view.},
booktitle = {2021 18th {International} {Conference} on {Ubiquitous} {Robots} ({UR})},
author = {Kim, Taewoo and Jang, Minsu and Kim, Jaehong},
month = jul,
year = {2021},
note = {ISSN: 2325-033X},
keywords = {Analytical models, Computer simulation, Lead, Licenses, Reinforcement learning, Rendering (computer graphics), Software},
pages = {63--67},
file = {IEEE Xplore Abstract Record:/home/dferigo/Zotero/storage/K5USS9RE/9494694.html:text/html;Kim et al_2021_A Survey on Simulation Environments for Reinforcement Learning.pdf:/home/dferigo/Zotero/storage/2HCEHSZB/Kim et al_2021_A Survey on Simulation Environments for Reinforcement Learning.pdf:application/pdf},
}
@inproceedings{belbute-peres_end--end_2018,
title = {End-to-{End} {Differentiable} {Physics} for {Learning} and {Control}},
language = {en},
author = {Belbute-Peres, Filipe de A and Smith, Kevin A and Allen, Kelsey R and Tenenbaum, Joshua B and Kolter, J Zico},
year = {2018},
pages = {12},
file = {Belbute-Peres et al_2018_End-to-End Differentiable Physics for Learning and Control.pdf:/home/dferigo/Zotero/storage/YX2WUZLM/Belbute-Peres et al_2018_End-to-End Differentiable Physics for Learning and Control.pdf:application/pdf},
}
@article{rackauckas_universal_2021,
title = {Universal {Differential} {Equations} for {Scientific} {Machine} {Learning}},
url = {http://arxiv.org/abs/2001.04385},
abstract = {In the context of science, the well-known adage “a picture is worth a thousand words” might well be “a model is worth a thousand datasets.” In this manuscript we introduce the SciML software ecosystem as a tool for mixing the information of physical laws and scientific models with data-driven machine learning approaches. We describe a mathematical object, which we denote universal differential equations (UDEs), as the unifying framework connecting the ecosystem. We show how a wide variety of applications, from automatically discovering biological mechanisms to solving high-dimensional Hamilton-Jacobi-Bellman equations, can be phrased and efficiently handled through the UDE formalism and its tooling. We demonstrate the generality of the software tooling to handle stochasticity, delays, and implicit constraints. This funnels the wide variety of SciML applications into a core set of training mechanisms which are highly optimized, stabilized for stiff equations, and compatible with distributed parallelism and GPU accelerators.},
language = {en},
urldate = {2022-04-20},
journal = {arXiv:2001.04385 [cs, math, q-bio, stat]},
author = {Rackauckas, Christopher and Ma, Yingbo and Martensen, Julius and Warner, Collin and Zubov, Kirill and Supekar, Rohit and Skinner, Dominic and Ramadhan, Ali and Edelman, Alan},
month = nov,
year = {2021},
note = {arXiv: 2001.04385},
keywords = {Computer Science - Machine Learning, Mathematics - Dynamical Systems, Quantitative Biology - Quantitative Methods, Statistics - Machine Learning},
file = {Rackauckas et al_2021_Universal Differential Equations for Scientific Machine Learning.pdf:/home/dferigo/Zotero/storage/3ALYEQ65/Rackauckas et al_2021_Universal Differential Equations for Scientific Machine Learning.pdf:application/pdf},
}
@article{singh_efficient_2022,
title = {Efficient {Analytical} {Derivatives} of {Rigid}-{Body} {Dynamics} using {Spatial} {Vector} {Algebra}},
volume = {7},
issn = {2377-3766, 2377-3774},
url = {http://arxiv.org/abs/2105.05102},
doi = {10.1109/LRA.2022.3141194},
abstract = {An essential need for many model-based robot control algorithms is the ability to quickly and accurately compute partial derivatives of the equations of motion. State of the art approaches to this problem often use analytical methods based on the chain rule applied to existing dynamics algorithms. Although these methods are an improvement over finite differences in terms of accuracy, they are not always the most efficient. In this paper, we contribute new closed-form expressions for the first-order partial derivatives of inverse dynamics, leading to a recursive algorithm. The algorithm is benchmarked against chain-rule approaches in Fortran and against an existing algorithm from the Pinocchio library in C++. Tests consider computing the partial derivatives of inverse and forward dynamics for robots ranging from kinematic chains to humanoids and quadrupeds. Compared to the previous open-source Pinocchio implementation, our new analytical results uncover a key computational restructuring that enables efficiency gains. Speedups of up to 1.4x are reported for calculating the partial derivatives of inverse dynamics for the 50-dof Talos humanoid.},
language = {en},
number = {2},
urldate = {2022-04-20},
journal = {IEEE Robotics and Automation Letters},
author = {Singh, Shubham and Russell, Ryan P. and Wensing, Patrick M.},
month = apr,
year = {2022},
note = {arXiv: 2105.05102},
keywords = {Computer Science - Robotics},
pages = {1776--1783},
file = {Singh et al_2022_Efficient Analytical Derivatives of Rigid-Body Dynamics using Spatial Vector.pdf:/home/dferigo/Zotero/storage/EU2RG38L/Singh et al_2022_Efficient Analytical Derivatives of Rigid-Body Dynamics using Spatial Vector.pdf:application/pdf},
}
@inproceedings{carpentier_analytical_2018,
title = {Analytical {Derivatives} of {Rigid} {Body} {Dynamics} {Algorithms}},
isbn = {978-0-9923747-4-7},
url = {http://www.roboticsproceedings.org/rss14/p38.pdf},
doi = {10.15607/RSS.2018.XIV.038},
abstract = {Rigid body dynamics is a well-established framework in robotics. It can be used to expose the analytic form of kinematic and dynamic functions of the robot model. So far, two major algorithms, namely the recursive Newton-Euler algorithm (RNEA) and the articulated body algorithm (ABA), have been proposed to compute the inverse dynamics and the forward dynamics in a few microseconds. Evaluating their derivatives is an important challenge for various robotic applications (optimal control, estimation, co-design or reinforcement learning). However it remains time consuming, whether using finite differences or automatic differentiation. In this paper, we propose new algorithms to efficiently compute them thanks to closed-form formulations. Using the chain rule and adequate algebraic differentiation of spatial algebra, we firstly differentiate explicitly RNEA. Then, using properties about the derivative of function composition, we show that the same algorithm can also be used to compute the derivatives of ABA with a marginal additional cost. For this purpose, we introduce a new algorithm to compute the inverse of the joint-space inertia matrix, without explicitly computing the matrix itself. All the algorithms are implemented in our open-source C++ framework called Pinocchio. Benchmarks show computational costs varying between 3 microseconds (for a 7-dof arm) up to 17 microseconds (for a 36-dof humanoid), outperforming the alternative approaches of the state of the art.},
language = {en},
urldate = {2021-05-18},
booktitle = {Robotics: {Science} and {Systems} {XIV}},
publisher = {Robotics: Science and Systems Foundation},
author = {Carpentier, Justin and Mansard, Nicolas},
month = jun,
year = {2018},
file = {Carpentier_Mansard_2018_Analytical Derivatives of Rigid Body Dynamics Algorithms.pdf:/home/dferigo/Zotero/storage/8XI5XBSA/Carpentier_Mansard_2018_Analytical Derivatives of Rigid Body Dynamics Algorithms.pdf:application/pdf},
}
@article{innes_differentiable_2019,
title = {A {Differentiable} {Programming} {System} to {Bridge} {Machine} {Learning} and {Scientific} {Computing}},
url = {http://arxiv.org/abs/1907.07587},
abstract = {Scientific computing is increasingly incorporating the advancements in machine learning and the ability to work with large amounts of data. At the same time, machine learning models are becoming increasingly sophisticated and exhibit many features often seen in scientific computing, stressing the capabilities of machine learning frameworks. Just as the disciplines of scientific computing and machine learning have shared common underlying infrastructure in the form of numerical linear algebra, we now have the opportunity to further share new computational infrastructure, and thus ideas, in the form of Differentiable Programming. We describe Zygote, a Differentiable Programming system that is able to take gradients of general program structures. We implement this system in the Julia programming language. Our system supports almost all language constructs (control flow, recursion, mutation, etc.) and compiles high-performance code without requiring any user intervention or refactoring to stage computations. This enables an expressive programming model for deep learning, but more importantly, it enables us to incorporate a large ecosystem of libraries in our models in a straightforward way. We discuss our approach to automatic differentiation, including its support for advanced techniques such as mixed-mode, complex and checkpointed differentiation, and present several examples of differentiating programs.},
urldate = {2019-09-14},
journal = {arXiv:1907.07587 [cs]},
author = {Innes, Mike and Edelman, Alan and Fischer, Keno and Rackauckas, Chris and Saba, Elliot and Shah, Viral B. and Tebbutt, Will},
month = jul,
year = {2019},
note = {arXiv: 1907.07587},
keywords = {Computer Science - Machine Learning, Computer Science - Programming Languages},
file = {arXiv.org Snapshot:/home/dferigo/Zotero/storage/GUIDB8WM/1907.html:text/html;Innes et al_2019_A Differentiable Programming System to Bridge Machine Learning and Scientific.pdf:/home/dferigo/Zotero/storage/5DV2YZ5K/Innes et al_2019_A Differentiable Programming System to Bridge Machine Learning and Scientific.pdf:application/pdf},
}
@misc{nvidia_nvidia_2011,
title = {Nvidia {PhysX}},
url = {https://developer.nvidia.com/physx-sdk},
author = {{NVIDIA}},
year = {2011},
}
@misc{nvidia_nvidia_2018,
title = {Nvidia {Isaac}},
url = {https://developer.nvidia.com/isaac-sdk},
author = {{NVIDIA}},
year = {2018},
}
@article{zhao_sim--real_2020,
title = {Sim-to-{Real} {Transfer} in {Deep} {Reinforcement} {Learning} for {Robotics}: a {Survey}},
shorttitle = {Sim-to-{Real} {Transfer} in {Deep} {Reinforcement} {Learning} for {Robotics}},
url = {http://arxiv.org/abs/2009.13303},
abstract = {Deep reinforcement learning has recently seen huge success across multiple areas in the robotics domain. Owing to the limitations of gathering real-world data, i.e., sample inefficiency and the cost of collecting it, simulation environments are utilized for training the different agents. This not only aids in providing a potentially infinite data source, but also alleviates safety concerns with real robots. Nonetheless, the gap between the simulated and real worlds degrades the performance of the policies once the models are transferred into real robots. Multiple research efforts are therefore now being directed towards closing this sim-to-real gap and accomplish more efficient policy transfer. Recent years have seen the emergence of multiple methods applicable to different domains, but there is a lack, to the best of our knowledge, of a comprehensive review summarizing and putting into context the different methods. In this survey paper, we cover the fundamental background behind sim-to-real transfer in deep reinforcement learning and overview the main methods being utilized at the moment: domain randomization, domain adaptation, imitation learning, meta-learning and knowledge distillation. We categorize some of the most relevant recent works, and outline the main application scenarios. Finally, we discuss the main opportunities and challenges of the different approaches and point to the most promising directions.},
language = {en},
urldate = {2020-10-02},
journal = {arXiv:2009.13303 [cs]},
author = {Zhao, Wenshuai and Queralta, Jorge Peña and Westerlund, Tomi},
month = sep,
year = {2020},
note = {arXiv: 2009.13303},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics},
file = {Zhao et al_2020_Sim-to-Real Transfer in Deep Reinforcement Learning for Robotics.pdf:/home/dferigo/Zotero/storage/BA2MFGQZ/Zhao et al_2020_Sim-to-Real Transfer in Deep Reinforcement Learning for Robotics.pdf:application/pdf},
}
@article{muratore_robot_2022,
title = {Robot {Learning} from {Randomized} {Simulations}: {A} {Review}},
shorttitle = {Robot {Learning} from {Randomized} {Simulations}},
url = {http://arxiv.org/abs/2111.00956},
abstract = {The rise of deep learning has caused a paradigm shift in robotics research, favoring methods that require large amounts of data. Unfortunately, it is prohibitively expensive to generate such data sets on a physical platform. Therefore, state-of-the-art approaches learn in simulation where data generation is fast as well as inexpensive and subsequently transfer the knowledge to the real robot (sim-to-real). Despite becoming increasingly realistic, all simulators are by construction based on models, hence inevitably imperfect. This raises the question of how simulators can be modified to facilitate learning robot control policies and overcome the mismatch between simulation and reality, often called the ‘reality gap’. We provide a comprehensive review of sim-to-real research for robotics, focusing on a technique named ‘domain randomization’ which is a method for learning from randomized simulations.},
language = {en},
urldate = {2022-04-20},
journal = {arXiv:2111.00956 [cs]},
author = {Muratore, Fabio and Ramos, Fabio and Turk, Greg and Yu, Wenhao and Gienger, Michael and Peters, Jan},
month = jan,
year = {2022},
note = {arXiv: 2111.00956},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics},
file = {Muratore et al_2022_Robot Learning from Randomized Simulations.pdf:/home/dferigo/Zotero/storage/BJEATQL9/Muratore et al_2022_Robot Learning from Randomized Simulations.pdf:application/pdf},
}
@article{li_reinforcement_2021,
title = {Reinforcement {Learning} for {Robust} {Parameterized} {Locomotion} {Control} of {Bipedal} {Robots}},
url = {http://arxiv.org/abs/2103.14295},
abstract = {Developing robust walking controllers for bipedal robots is a challenging endeavor. Traditional model-based locomotion controllers require simplifying assumptions and careful modelling; any small errors can result in unstable control. To address these challenges for bipedal locomotion, we present a model-free reinforcement learning framework for training robust locomotion policies in simulation, which can then be transferred to a real bipedal Cassie robot. To facilitate sim-to-real transfer, domain randomization is used to encourage the policies to learn behaviors that are robust across variations in system dynamics. The learned policies enable Cassie to perform a set of diverse and dynamic behaviors, while also being more robust than traditional controllers and prior learning-based methods that use residual control. We demonstrate this on versatile walking behaviors such as tracking a target walking velocity, walking height, and turning yaw.},
urldate = {2021-05-27},
journal = {arXiv:2103.14295 [cs, eess]},
author = {Li, Zhongyu and Cheng, Xuxin and Peng, Xue Bin and Abbeel, Pieter and Levine, Sergey and Berseth, Glen and Sreenath, Koushil},
month = mar,
year = {2021},
note = {arXiv: 2103.14295},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Computer Science - Artificial Intelligence, Electrical Engineering and Systems Science - Systems and Control},
file = {Li et al. - 2021 - Reinforcement Learning for Robust Parameterized Lo.pdf:/home/dferigo/Zotero/storage/CG5A5LGU/Li et al. - 2021 - Reinforcement Learning for Robust Parameterized Lo.pdf:application/pdf},
}
@inproceedings{seung-joon_yi_learning_2011,
address = {Shanghai, China},
title = {Learning full body push recovery control for small humanoid robots},
isbn = {978-1-61284-386-5},
url = {http://ieeexplore.ieee.org/document/5980531/},
doi = {10.1109/ICRA.2011.5980531},
abstract = {Dynamic bipedal walking is susceptible to external disturbances and surface irregularities, requiring robust feedback control to remain stable. In this work, we present a practical hierarchical push recovery strategy that can be readily implemented on a wide range of humanoid robots. Our method consists of low level controllers that perform simple, biomechanically motivated push recovery actions and a high level controller that combines the low level controllers according to proprioceptive and inertial sensory signals and the current robot state. Reinforcement learning is used to optimize the parameters of the controllers in order to maximize the stability of the robot over a broad range of external disturbances. The controllers are learned on a physical simulation and implemented on the Darwin-HP humanoid robot platform, and the resulting experiments demonstrate effective full body push recovery behaviors during dynamic walking.},
urldate = {2021-05-27},
booktitle = {2011 {IEEE} {International} {Conference} on {Robotics} and {Automation}},
publisher = {IEEE},
author = {Yi, Seung-Joon and Zhang, Byoung-Tak and Hong, Dennis and Lee, Daniel D.},
month = may,
year = {2011},
pages = {2047--2052},
file = {Seung-Joon Yi et al. - 2011 - Learning full body push recovery control for small.pdf:/home/dferigo/Zotero/storage/N3N4IJG7/Seung-Joon Yi et al. - 2011 - Learning full body push recovery control for small.pdf:application/pdf},
}
@article{smith_legged_2021,
title = {Legged {Robots} that {Keep} on {Learning}: {Fine}-{Tuning} {Locomotion} {Policies} in the {Real} {World}},
shorttitle = {Legged {Robots} that {Keep} on {Learning}},
url = {http://arxiv.org/abs/2110.05457},
abstract = {Legged robots are physically capable of traversing a wide range of challenging environments, but designing controllers that are sufficiently robust to handle this diversity has been a long-standing challenge in robotics. Reinforcement learning presents an appealing approach for automating the controller design process and has been able to produce remarkably robust controllers when trained in a suitable range of environments. However, it is difficult to predict all likely conditions the robot will encounter during deployment and enumerate them at training-time. What if instead of training controllers that are robust enough to handle any eventuality, we enable the robot to continually learn in any setting it finds itself in? This kind of real-world reinforcement learning poses a number of challenges, including efficiency, safety, and autonomy. To address these challenges, we propose a practical robot reinforcement learning system for fine-tuning locomotion policies in the real world. We demonstrate that a modest amount of real-world training can substantially improve performance during deployment, and this enables a real A1 quadrupedal robot to autonomously fine-tune multiple locomotion skills in a range of environments, including an outdoor lawn and a variety of indoor terrains.},
urldate = {2021-10-18},
journal = {arXiv:2110.05457 [cs]},
author = {Smith, Laura and Kew, J. Chase and Peng, Xue Bin and Ha, Sehoon and Tan, Jie and Levine, Sergey},
month = oct,
year = {2021},
note = {arXiv: 2110.05457},
keywords = {Computer Science - Robotics},
file = {Smith et al. - 2021 - Legged Robots that Keep on Learning Fine-Tuning L.pdf:/home/dferigo/Zotero/storage/FGTZX2P6/Smith et al. - 2021 - Legged Robots that Keep on Learning Fine-Tuning L.pdf:application/pdf},
}
@inproceedings{bloesch_towards_2022,
title = {Towards {Real} {Robot} {Learning} in the {Wild}: {A} {Case} {Study} in {Bipedal} {Locomotion}},
shorttitle = {Towards {Real} {Robot} {Learning} in the {Wild}},
abstract = {Algorithms for self-learning systems have made considerable progress in recent years, yet safety concerns and the need for additional instrumentation have so far largely limited learning experiments with real robots to well controlled lab settings. In this paper, we demonstrate how a small bipedal robot can autonomously learn to walk with minimal human intervention and with minimal instrumentation of the environment. We employ data-efficient off-policy deep reinforcement learning to learn to walk end-to-end, directly on hardware, using rewards that are computed exclusively from proprioceptive sensing. To allow the robot to autonomously adapt its behaviour to its environment, we additionally provide the agent with raw RGB camera images as input. By deploying two robots in different geographic locations while sharing data in a distributed learning setup, we achieve higher throughput and greater diversity of the training data. Our learning experiments constitute a step towards the long-term vision of learning “in the wild” for legged robots, and, to our knowledge, represent the first demonstration of learning a deep neural network controller for bipedal locomotion directly on hardware.},
language = {en},
booktitle = {Proceedings of the 5th {Conference} on {Robot} {Learning}},
publisher = {PMLR},
author = {Bloesch, Michael and Humplik, Jan and Patraucean, Viorica and Hafner, Roland and Haarnoja, Tuomas and Byravan, Arunkumar and Siegel, Noah Yamamoto and Tunyasuvunakool, Saran and Casarini, Federico and Batchelor, Nathan and Romano, Francesco and Saliceti, Stefano and Riedmiller, Martin and Eslami, S. M. Ali and Heess, Nicolas},
month = jan,
year = {2022},
note = {ISSN: 2640-3498},
pages = {1502--1511},
file = {Bloesch et al_2022_Towards Real Robot Learning in the Wild.pdf:/home/dferigo/Zotero/storage/MUVLGR7I/Bloesch et al_2022_Towards Real Robot Learning in the Wild.pdf:application/pdf},
}
@inproceedings{castillo_robust_2021,
address = {Prague, Czech Republic},
title = {Robust {Feedback} {Motion} {Policy} {Design} {Using} {Reinforcement} {Learning} on a {3D} {Digit} {Bipedal} {Robot}},
isbn = {978-1-66541-714-3},
url = {https://ieeexplore.ieee.org/document/9636467/},
doi = {10.1109/IROS51168.2021.9636467},
abstract = {In this paper, a hierarchical and robust framework for learning bipedal locomotion is presented and successfully implemented on the 3D biped robot Digit built by Agility Robotics. We propose a cascade-structure controller that combines the learning process with intuitive feedback regulations. This design allows the framework to realize robust and stable walking with a reduced-dimensional state and action spaces of the policy, significantly simplifying the design and increasing the sampling efficiency of the learning method. The inclusion of feedback regulation into the framework improves the robustness of the learned walking gait and ensures the success of the sim-to-real transfer of the proposed controller with minimal tuning. We specifically present a learning pipeline that considers hardware-feasible initial poses of the robot within the learning process to ensure the initial state of the learning is replicated as close as possible to the initial state of the robot in hardware experiments. Finally, we demonstrate the feasibility of our method by successfully transferring the learned policy in simulation to the Digit robot hardware, realizing sustained walking gaits under external force disturbances and challenging terrains not incurred during the training process. To the best of our knowledge, this is the first time a learning-based policy is transferred successfully to the Digit robot in hardware experiments.},
language = {en},
urldate = {2022-04-19},
booktitle = {2021 {IEEE}/{RSJ} {International} {Conference} on {Intelligent} {Robots} and {Systems} ({IROS})},
publisher = {IEEE},
author = {Castillo, Guillermo A. and Weng, Bowen and Zhang, Wei and Hereid, Ayonga},
month = sep,
year = {2021},
pages = {5136--5143},
file = {Castillo et al. - 2021 - Robust Feedback Motion Policy Design Using Reinfor.pdf:/home/dferigo/Zotero/storage/43DSYDWU/Castillo et al. - 2021 - Robust Feedback Motion Policy Design Using Reinfor.pdf:application/pdf},
}
@inproceedings{rudin_learning_2022,
title = {Learning to {Walk} in {Minutes} {Using} {Massively} {Parallel} {Deep} {Reinforcement} {Learning}},
url = {https://proceedings.mlr.press/v164/rudin22a.html},
abstract = {In this work, we present and study a training set-up that achieves fast policy generation for real-world robotic tasks by using massive parallelism on a single workstation GPU. We analyze and discuss the impact of different training algorithm components in the massively parallel regime on the final policy performance and training times. In addition, we present a novel game-inspired curriculum that is well suited for training with thousands of simulated robots in parallel. We evaluate the approach by training the quadrupedal robot ANYmal to walk on challenging terrain. The parallel approach allows training policies for flat terrain in under four minutes, and in twenty minutes for uneven terrain. This represents a speedup of multiple orders of magnitude compared to previous work. Finally, we transfer the policies to the real robot to validate the approach. We open-source our training code to help accelerate further research in the field of learned legged locomotion: https://leggedrobotics.github.io/legged\_gym/.},
language = {en},
urldate = {2022-04-19},
booktitle = {Proceedings of the 5th {Conference} on {Robot} {Learning}},
publisher = {PMLR},
author = {Rudin, Nikita and Hoeller, David and Reist, Philipp and Hutter, Marco},
month = jan,
year = {2022},
note = {ISSN: 2640-3498},
pages = {91--100},
file = {Rudin et al_2022_Learning to Walk in Minutes Using Massively Parallel Deep Reinforcement Learning.pdf:/home/dferigo/Zotero/storage/UKAV6F4D/Rudin et al_2022_Learning to Walk in Minutes Using Massively Parallel Deep Reinforcement Learning.pdf:application/pdf},
}
@inproceedings{gangapurwala_real-time_2021,
address = {Xi'an, China},
title = {Real-{Time} {Trajectory} {Adaptation} for {Quadrupedal} {Locomotion} using {Deep} {Reinforcement} {Learning}},
isbn = {978-1-72819-077-8},
url = {https://ieeexplore.ieee.org/document/9561639/},
doi = {10.1109/ICRA48506.2021.9561639},
abstract = {We present a control architecture for real-time adaptation and tracking of trajectories generated using a terrain-aware trajectory optimization solver. This approach enables us to circumvent the computationally exhaustive task of online trajectory optimization, and further introduces a control solution robust to systems modeled with approximated dynamics. We train a policy using deep reinforcement learning (RL) to introduce additive deviations to a reference trajectory in order to generate a feedback-based trajectory tracking system for a quadrupedal robot. We train this policy across a multitude of simulated terrains and ensure its generality by introducing training methods that avoid overfitting and convergence towards local optima. Additionally, in order to capture terrain information, we include a latent representation of the height maps in the observation space of the RL environment as a form of exteroceptive feedback. We test the performance of our trained policy by tracking the corrected set points using a model-based whole-body controller and compare it with the tracking behavior obtained without the corrective feedback in several simulation environments, and show that introducing the corrective feedback results in increase of the success rate from 72.7\% to 92.4\% for tracking precomputed dynamic long horizon trajectories on flat terrain and from 47.5\% to 80.3\% on a complex modular uneven terrain. We also show successful transfer of our training approach to the real physical system and further present cogent arguments in support of our framework.},
language = {en},
urldate = {2022-04-19},
booktitle = {2021 {IEEE} {International} {Conference} on {Robotics} and {Automation} ({ICRA})},
publisher = {IEEE},
author = {Gangapurwala, Siddhant and Geisert, Mathieu and Orsolino, Romeo and Fallon, Maurice and Havoutis, Ioannis},
month = may,
year = {2021},
pages = {5973--5979},
file = {Gangapurwala et al_2021_Real-Time Trajectory Adaptation for Quadrupedal Locomotion using Deep.pdf:/home/dferigo/Zotero/storage/8MU3IKJR/Gangapurwala et al_2021_Real-Time Trajectory Adaptation for Quadrupedal Locomotion using Deep.pdf:application/pdf},
}
@article{bellegarda_robust_2021,
title = {Robust {High}-speed {Running} for {Quadruped} {Robots} via {Deep} {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/2103.06484},
abstract = {Deep reinforcement learning has emerged as a popular and powerful way to develop locomotion controllers for quadruped robots. Common approaches have largely focused on learning actions directly in joint space, or learning to modify and offset foot positions produced by trajectory generators. Both approaches typically require careful reward shaping and training for millions of time steps, and with trajectory generators introduce human bias into the resulting control policies. In this paper, we instead explore learning foot positions in Cartesian space, which we track with impedance control, for a task of running as fast as possible subject to environmental disturbances. Compared with other action spaces, we observe less needed reward shaping, much improved sample efficiency, the emergence of natural gaits such as galloping and bounding, and ease of sim-to-sim transfer. Policies can be learned in only a few million time steps, even for challenging tasks of running over rough terrain with loads of over 100\% of the nominal quadruped mass. Training occurs in PyBullet, and we perform a sim-to-sim transfer to Gazebo, where our quadruped is able to run at over 4 m/s without a load, and 3.5 m/s with a 10 kg load, which is over 83\% of the nominal quadruped mass. Video results can be found at https://youtu.be/roE1vxpEWfw.},
language = {en},
urldate = {2021-05-27},
journal = {arXiv:2103.06484 [cs, eess]},
author = {Bellegarda, Guillaume and Nguyen, Quan},
month = mar,
year = {2021},
note = {sim-to-sim},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Electrical Engineering and Systems Science - Systems and Control, gazebo, locomotion, pybullet, quadrupeds, rl, sim-to-sim, simulation},
file = {Bellegarda_Nguyen_2021_Robust High-speed Running for Quadruped Robots via Deep Reinforcement Learning.pdf:/home/dferigo/Zotero/storage/5NMBKYDC/Bellegarda_Nguyen_2021_Robust High-speed Running for Quadruped Robots via Deep Reinforcement Learning.pdf:application/pdf},
}
@article{haarnoja_soft_2018,
title = {Soft {Actor}-{Critic}: {Off}-{Policy} {Maximum} {Entropy} {Deep} {Reinforcement} {Learning} with a {Stochastic} {Actor}},
shorttitle = {Soft {Actor}-{Critic}},
url = {http://arxiv.org/abs/1801.01290},
abstract = {Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to complex, real-world domains. In this paper, we propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework. In this framework, the actor aims to maximize expected reward while also maximizing entropy. That is, to succeed at the task while acting as randomly as possible. Prior deep RL methods based on this framework have been formulated as Q-learning methods. By combining off-policy updates with a stable stochastic actor-critic formulation, our method achieves state-of-the-art performance on a range of continuous control benchmark tasks, outperforming prior on-policy and off-policy methods. Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving very similar performance across different random seeds.},
urldate = {2020-05-07},
journal = {arXiv:1801.01290 [cs, stat]},
author = {Haarnoja, Tuomas and Zhou, Aurick and Abbeel, Pieter and Levine, Sergey},
month = aug,
year = {2018},
note = {arXiv: 1801.01290},
keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Statistics - Machine Learning},
file = {arXiv.org Snapshot:/home/dferigo/Zotero/storage/53U5GB7V/1801.html:text/html;Haarnoja et al_2018_Soft Actor-Critic.pdf:/home/dferigo/Zotero/storage/KI69MXY9/Haarnoja et al_2018_Soft Actor-Critic.pdf:application/pdf},
}
@article{mnih_human-level_2015,
title = {Human-level control through deep reinforcement learning},
volume = {518},
issn = {0028-0836, 1476-4687},
url = {http://www.nature.com/articles/nature14236},
doi = {10.1038/nature14236},
language = {en},
number = {7540},
urldate = {2018-06-23},
journal = {Nature},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
month = feb,
year = {2015},
keywords = {deepmind},
pages = {529--533},
file = {Mnih et al_2015_Human-level control through deep reinforcement learning.pdf:/home/dferigo/Zotero/storage/WC7QJIMF/Mnih et al_2015_Human-level control through deep reinforcement learning.pdf:application/pdf},
}
@inproceedings{kohl_policy_2004,
address = {New Orleans, LA, USA},
title = {Policy gradient reinforcement learning for fast quadrupedal locomotion},
isbn = {978-0-7803-8232-9},
url = {http://ieeexplore.ieee.org/document/1307456/},
doi = {10.1109/ROBOT.2004.1307456},
abstract = {This paper presents a machine learning approach to optimizing a quadrupedal trot gait for forward speed. Given a parameterized walk designed for a specific robot, we propose using a form of policy gradient reinforcement learning to automatically search the set of possible parameters with the goal of finding the fastest possible walk. We implement and test our approach on a commercially available quadrupedal robot platform, namely the Sony Aibo robot. After about three hours of learning, all on the physical robots and with no human intervention other than to change the batteries, the robots achieved a gait faster than any previously known gait for the Aibo, significantly outperforming a variety of existing hand-coded and learned solutions.},
language = {en},
urldate = {2022-04-19},
booktitle = {{IEEE} {International} {Conference} on {Robotics} and {Automation}, 2004. {Proceedings}. {ICRA} '04. 2004},
publisher = {IEEE},
author = {Kohl, N. and Stone, P.},
year = {2004},
pages = {2619--2624 Vol.3},
file = {Kohl_Stone_2004_Policy gradient reinforcement learning for fast quadrupedal locomotion.pdf:/home/dferigo/Zotero/storage/6VYM3T8R/Kohl_Stone_2004_Policy gradient reinforcement learning for fast quadrupedal locomotion.pdf:application/pdf},
}
@inproceedings{atkeson_robot_1997,
title = {Robot {Learning} {From} {Demonstration}},
abstract = {The goal of robot learning from demonstration is to have a robot learn from watching a demonstration of the task to be performed. In our approach to learning from demonstration the robot learns a reward function from the demonstration and a task model from repeated attempts to perform the task. A policy is computed based on the learned reward function and task model. Lessons learned from an implementation on an anthropomorphic robot arm using a pendulum swing up task include 1) simply mimicking demonstrated motions is not adequate to perform this task, 2) a task planner can use a learned model and reward function to compute an appropriate policy, 3) this model-based planning process supports rapid learning, 4) both parametric and nonparametric models can be learned and used, and 5) incorporating a task level direct learning component, which is non-model-based, in addition to the model-based planner, is useful in compensating for structural modeling errors and slow model learning.},
language = {en},
author = {Atkeson, Christopher G and Schaal, Stefan},
year = {1997},
pages = {9},
file = {Atkeson_Schaal_Robot Learning From Demonstration.pdf:/home/dferigo/Zotero/storage/GNWQGNQS/Atkeson_Schaal_Robot Learning From Demonstration.pdf:application/pdf},
}
@inproceedings{kolter_hierarchical_2007,
title = {Hierarchical {Apprenticeship} {Learning} with {Application} to {Quadruped} {Locomotion}},
abstract = {We consider apprenticeship learning—learning from expert demonstrations—in the setting of large, complex domains. Past work in apprenticeship learning requires that the expert demonstrate complete trajectories through the domain. However, in many problems even an expert has difficulty controlling the system, which makes this approach infeasible. For example, consider the task of teaching a quadruped robot to navigate over extreme terrain; demonstrating an optimal policy (i.e., an optimal set of foot locations over the entire terrain) is a highly non-trivial task, even for an expert. In this paper we propose a method for hierarchical apprenticeship learning, which allows the algorithm to accept isolated advice at different hierarchical levels of the control task. This type of advice is often feasible for experts to give, even if the expert is unable to demonstrate complete trajectories. This allows us to extend the apprenticeship learning paradigm to much larger, more challenging domains. In particular, in this paper we apply the hierarchical apprenticeship learning algorithm to the task of quadruped locomotion over extreme terrain, and achieve, to the best of our knowledge, results superior to any previously published work.},
language = {en},
author = {Kolter, J Z and Abbeel, Pieter and Ng, Andrew Y},
year = {2007},
file = {Kolter et al. - Hierarchical Apprenticeship Learning with Applicat.pdf:/home/dferigo/Zotero/storage/3YAK7MMU/Kolter et al. - Hierarchical Apprenticeship Learning with Applicat.pdf:application/pdf},
}
@article{theodorou_generalized_2010,
title = {A {Generalized} {Path} {Integral} {Control} {Approach} to {Reinforcement} {Learning}},
volume = {11},
abstract = {With the goal to generate more scalable algorithms with higher efficiency and fewer open parameters, reinforcement learning (RL) has recently moved towards combining classical techniques from optimal control and dynamic programming with modern learning techniques from statistical estimation theory. In this vein, this paper suggests to use the framework of stochastic optimal control with path integrals to derive a novel approach to RL with parameterized policies. While solidly grounded in value function estimation and optimal control based on the stochastic Hamilton-Jacobi-Bellman (HJB) equations, policy improvements can be transformed into an approximation problem of a path integral which has no open algorithmic parameters other than the exploration noise. The resulting algorithm can be conceived of as model-based, semi-model-based, or even model free, depending on how the learning problem is structured. The update equations have no danger of numerical instabilities as neither matrix inversions nor gradient learning rates are required. Our new algorithm demonstrates interesting similarities with previous RL research in the framework of probability matching and provides intuition why the slightly heuristically motivated probability matching approach can actually perform well. Empirical evaluations demonstrate significant performance improvements over gradient-based policy learning and scalability to high-dimensional control problems. Finally, a learning experiment on a simulated 12 degree-of-freedom robot dog illustrates the functionality of our algorithm in a complex robot learning scenario. We believe that Policy Improvement with Path Integrals (PI2) offers currently one of the most efficient, numerically robust, and easy to implement algorithms for RL based on trajectory roll-outs.},
language = {en},
journal = {The Journal of Machine Learning Research},
author = {Theodorou, Evangelos A and Buchli, Jonas and Schaal, Stefan},
year = {2010},
pages = {45},
file = {Theodorou et al_A Generalized Path Integral Control Approach to Reinforcement Learning.pdf:/home/dferigo/Zotero/storage/N8HXKEPV/Theodorou et al_A Generalized Path Integral Control Approach to Reinforcement Learning.pdf:application/pdf},
}
@inproceedings{peters_reinforcement_2003,
title = {Reinforcement {Learning} for {Humanoid} {Robotics}},
abstract = {Reinforcement learning offers one of the most general frameworks to take traditional robotics towards true autonomy and versatility. However, applying reinforcement learning to high dimensional movement systems like humanoid robots remains an unsolved problem. In this paper, we discuss different approaches of reinforcement learning in terms of their applicability in humanoid robotics. Methods can be coarsely classified into three different categories, i.e., greedy methods, ‘vanilla’ policy gradient methods, and natural gradient methods. We discuss that greedy methods are not likely to scale into the domain of humanoid robotics as they are problematic when used with function approximation. ‘Vanilla’ policy gradient methods on the other hand have been successfully applied on real-world robots including at least one humanoid robot [3]. We demonstrate that these methods can be significantly improved using the natural policy gradient instead of the regular policy gradient. A derivation of the natural policy gradient is provided, proving that the average policy gradient of Kakade [10] is indeed the true natural gradient. A general algorithm for estimating the natural gradient, the Natural Actor-Critic algorithm, is introduced. This algorithm converges to the nearest local minimum of the cost function with respect to the Fisher information metric under suitable conditions. The algorithm outperforms non-natural policy gradients by far in a cart-pole balancing evaluation, and for learning nonlinear dynamic motor primitives for humanoid robot control. It offers a promising route for the development of reinforcement learning for truly high-dimensionally continuous state-action systems.},
language = {en},
booktitle = {Proceedings of the third {IEEE}-{RAS} international conference on humanoid robots},
author = {Peters, Jan and Vijayakumar, Sethu and Schaal, Stefan},
year = {2003},
pages = {20},
file = {Peters et al_Reinforcement Learning for Humanoid Robotics.pdf:/home/dferigo/Zotero/storage/WVS83I5T/Peters et al_Reinforcement Learning for Humanoid Robotics.pdf:application/pdf},
}
@article{gullapalli_acquiring_1994,
title = {Acquiring robot skills via reinforcement learning},
volume = {14},
issn = {1941-000X},
doi = {10.1109/37.257890},
abstract = {Skill acquisition is a difficult, yet important problem in robot performance. The authors focus on two skills, namely robotic assembly and balancing and on two classic tasks to develop these skills via learning: the peg-in-hole insertion task, and the ball balancing task. A stochastic real-valued (SRV) reinforcement learning algorithm is described and used for learning control and the authors show how it can be used with nonlinear multilayer ANNs. In the peg-in-hole insertion task the SRV network successfully learns to insert a peg into a hole with extremely low clearance, in spite of high sensor noise. In the ball balancing task the SRV network successfully learns to balance the ball with minimal feedback.},
number = {1},
journal = {IEEE Control Systems Magazine},
author = {Gullapalli, V. and Franklin, J.A. and Benbrahim, H.},
month = feb,
year = {1994},
note = {Conference Name: IEEE Control Systems Magazine},
keywords = {Adaptive control, Control design, Control systems, Delay, Feedback, Robot control, Robotic assembly, Robust control, Supervised learning, Uncertainty},
pages = {13--24},
file = {Gullapalli et al_1994_Acquiring robot skills via reinforcement learning.pdf:/home/dferigo/Zotero/storage/D239HJKA/Gullapalli et al_1994_Acquiring robot skills via reinforcement learning.pdf:application/pdf},
}
@inproceedings{kober_policy_2008,
title = {Policy {Search} for {Motor} {Primitives} in {Robotics}},
volume = {21},
url = {https://proceedings.neurips.cc/paper/2008/hash/7647966b7343c29048673252e490f736-Abstract.html},
abstract = {Many motor skills in humanoid robotics can be learned using parametrized motor primitives as done in imitation learning. However, most interesting motor learning problems are high-dimensional reinforcement learning problems often beyond the reach of current methods. In this paper, we extend previous work on policy learning from the immediate reward case to episodic reinforcement learning. We show that this results in a general, common framework also connected to policy gradient methods and yielding a novel algorithm for policy learning by assuming a form of exploration that is particularly well-suited for dynamic motor primitives. The resulting algorithm is an EM-inspired algorithm applicable in complex motor learning tasks. We compare this algorithm to alternative parametrized policy search methods and show that it outperforms previous methods. We apply it in the context of motor learning and show that it can learn a complex Ball-in-a-Cup task using a real Barrett WAM robot arm.},
urldate = {2022-04-15},
booktitle = {Advances in {Neural} {Information} {Processing} {Systems}},
publisher = {Curran Associates, Inc.},
author = {Kober, Jens and Peters, Jan},
year = {2008},
file = {Kober_Peters_2008_Policy Search for Motor Primitives in Robotics.pdf:/home/dferigo/Zotero/storage/BFS6BPCJ/Kober_Peters_2008_Policy Search for Motor Primitives in Robotics.pdf:application/pdf},
}
@inproceedings{honglak_lee_quadruped_2006,
address = {Orlando, FL, USA},
title = {Quadruped robot obstacle negotiation via reinforcement learning},
isbn = {978-0-7803-9505-3},
url = {http://ieeexplore.ieee.org/document/1642158/},
doi = {10.1109/ROBOT.2006.1642158},
abstract = {Legged robots can, in principle, traverse a large variety of obstacles and terrains. In this paper, we describe a successful application of reinforcement learning to the problem of negotiating obstacles with a quadruped robot. Our algorithm is based on a two-level hierarchical decomposition of the task, in which the high-level controller selects the sequence of foot placement positions, and the low-level controller generates the continuous motions to move each foot to the specified positions. The high-level controller uses an estimate of the value function to guide its search; this estimate is learned partially from supervised data. The low-level controller is obtained via policy search. We demonstrate that our robot can successfully climb over a variety of obstacles which were not seen at training time.},
language = {en},
urldate = {2022-04-15},
booktitle = {Proceedings 2006 {IEEE} {International} {Conference} on {Robotics} and {Automation}, 2006. {ICRA} 2006.},
publisher = {IEEE},
author = {Lee, Honglak and Shen, Yirong and Yu, Chih-Han and Singh, G. and Ng, A.Y.},
year = {2006},
pages = {3003--3010},
file = {Honglak Lee et al_2006_Quadruped robot obstacle negotiation via reinforcement learning.pdf:/home/dferigo/Zotero/storage/LE662D9H/Honglak Lee et al_2006_Quadruped robot obstacle negotiation via reinforcement learning.pdf:application/pdf},
}
@inproceedings{kohl_machine_2004,
title = {Machine {Learning} for {Fast} {Quadrupedal} {Locomotion}},
abstract = {For a robot, the ability to get from one place to another is one of the most basic skills. However, locomotion on legged robots is a challenging multidimensional control problem. This paper presents a machine learning approach to legged locomotion, with all training done on the physical robots. The main contributions are a specification of our fully automated learning environment and a detailed empirical comparison of four different machine learning algorithms for learning quadrupedal locomotion. The resulting learned walk is considerably faster than all previously reported hand-coded walks for the same robot platform.},
language = {en},
author = {Kohl, Nate and Stone, Peter},
year = {2004},
pages = {6},
file = {Kohl_Stone_2004_Machine Learning for Fast Quadrupedal Locomotion.pdf:/home/dferigo/Zotero/storage/ZF4WFM4N/Kohl_Stone_2004_Machine Learning for Fast Quadrupedal Locomotion.pdf:application/pdf},
}
@article{zico_kolter_stanford_2011,
title = {The {Stanford} {LittleDog}: {A} learning and rapid replanning approach to quadruped locomotion},
volume = {30},
issn = {0278-3649, 1741-3176},
shorttitle = {The {Stanford} {LittleDog}},
url = {http://journals.sagepub.com/doi/10.1177/0278364910390537},
doi = {10.1177/0278364910390537},
abstract = {Legged robots offer the potential to navigate a wide variety of terrains that are inaccessible to wheeled vehicles. In this paper we consider the planning and control tasks of navigating a quadruped robot over a wide variety of challenging terrain, including terrain which it has not seen until run-time. We present a software architecture that makes use of both static and dynamic gaits, as well as specialized dynamic maneuvers, to accomplish this task. Throughout the paper we highlight two themes that have been central to our approach: 1) the prevalent use of learning algorithms, and 2) a focus on rapid recovery and replanning techniques; we present several novel methods and algorithms that we developed for the quadruped and that illustrate these two themes. We evaluate the performance of these different methods, and also present and discuss the performance of our system on the official Learning Locomotion tests.},
language = {en},
number = {2},
urldate = {2022-04-15},
journal = {The International Journal of Robotics Research},
author = {Zico Kolter, J. and Ng, Andrew Y},
month = feb,
year = {2011},
pages = {150--174},
file = {Zico Kolter_Ng_2011_The Stanford LittleDog.pdf:/home/dferigo/Zotero/storage/3SSTHW7F/Zico Kolter_Ng_2011_The Stanford LittleDog.pdf:application/pdf},
}
@article{schaal_is_1999,
title = {Is imitation learning the route to humanoid robots?},
volume = {3},
issn = {13646613},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661399013273},
doi = {10.1016/S1364-6613(99)01327-3},
abstract = {This review investigates two recent developments in artificial intelligence and neural computation: learning from imitation and the development of humanoid robots. It will be postulated that the study of imitation learning offers a promising route to gain new insights into mechanisms of perceptual motor control that could ultimately lead to the creation of autonomous humanoid robots. Imitation learning focuses on three important issues: efficient motor learning, the connection between action and perception, and modular motor control in form of movement primitives. It will be reviewed how research on representations of, and functional connections between action and perception have contributed to our understanding of motor acts of other beings. The recent discovery that some areas in the primate brain are active during both movement perception and execution has provided a hypothetical neural basis of imitation. Computational approaches to imitation learning will also be described, initially from the perspective of traditional AI and robotics, but also from the perspective of neural network models and statistical learning research. Parallels and differences between biological and computational approaches to imitation will be highlighted and an overview of current projects that actually employ imitation learning for humanoid robots will be given.},
language = {en},
number = {6},
urldate = {2022-04-15},
journal = {Trends in Cognitive Sciences},
author = {Schaal, Stefan},
month = jun,
year = {1999},
pages = {233--242},
file = {Schaal_1999_Is imitation learning the route to humanoid robots.pdf:/home/dferigo/Zotero/storage/IN2PXD9V/Schaal_1999_Is imitation learning the route to humanoid robots.pdf:application/pdf},
}
@inproceedings{schaal_learning_1996,
title = {Learning from {Demonstration}},
volume = {9},
url = {https://proceedings.neurips.cc/paper/1996/hash/68d13cf26c4b4f4f932e3eff990093ba-Abstract.html},
abstract = {By now it is widely accepted that learning a task from scratch, i.e., without any prior knowledge, is a daunting undertaking. Humans, however, rarely attempt to learn from scratch. They extract initial biases as well as strategies how to approach a learning problem from instructions and/or demonstrations of other humans. For learning control, this paper investigates how learning from demonstration can be applied in the context of reinforcement learning. We consider priming the Q-function, the value function, the policy, and the model of the task dynamics as possible areas where demonstrations can speed up learning. In general nonlinear learning problems, only model-based reinforcement learning shows significant speed-up after a demonstration, while in the special case of linear quadratic regulator (LQR) problems, all methods profit from the demonstration. In an implementation of pole balancing on a complex anthropomorphic robot arm, we demonstrate that, when facing the complexities of real signal processing, model-based reinforcement learning offers the most robustness for LQR problems. Using the suggested methods, the robot learns pole balancing in just a single trial after a 30 second long demonstration of the human instructor.},
urldate = {2022-04-15},
booktitle = {Advances in {Neural} {Information} {Processing} {Systems}},
publisher = {MIT Press},
author = {Schaal, Stefan},
year = {1996},
}
@incollection{schaal_dynamic_2006,
address = {Tokyo},
title = {Dynamic {Movement} {Primitives} - {A} {Framework} for {Motor} {Control} in {Humans} and {Humanoid} {Robotics}},
isbn = {978-4-431-24164-5},
url = {http://link.springer.com/10.1007/4-431-31381-8_23},
abstract = {Given the continuous stream of movements that biological systems exhibit in their daily activities, an account for such versatility and creativity has to assume that movement sequences consist of segments, executed either in sequence or with partial or complete overlap. Therefore, a fundamental question that has pervaded research in motor control both in artificial and biological systems revolves around identifying movement primitives (a.k.a. units of actions, basis behaviors, motor schemas, etc.). What are the fundamental building blocks that are strung together, adapted to, and created for ever new behaviors? This paper summarizes results that led to the hypothesis of Dynamic Movement Primitives (DMP). DMPs are units of action that are formalized as stable nonlinear attractor systems. They are useful for autonomous robotics as they are highly flexible in creating complex rhythmic (e.g., locomotion) and discrete (e.g., a tennis swing) behaviors that can quickly be adapted to the inevitable perturbations of a dynamically changing, stochastic environment. Moreover, DMPs provide a formal framework that also lends itself to investigations in computational neuroscience. A recent finding that allows creating DMPs with the help of well-understood statistical learning methods has elevated DMPs from a more heuristic to a principled modeling approach. Theoretical insights, evaluations on a humanoid robot, and behavioral and brain imaging data will serve to outline the framework of DMPs for a general approach to motor control in robotics and biology.},
language = {en},
urldate = {2022-04-15},
booktitle = {Adaptive {Motion} of {Animals} and {Machines}},
publisher = {Springer-Verlag},
author = {Schaal, Stefan},
year = {2006},
doi = {10.1007/4-431-31381-8_23},
pages = {261--280},
file = {Schaal_2006_Dynamic Movement Primitives -A Framework for Motor Control in Humans and.pdf:/home/dferigo/Zotero/storage/NGSBJLMR/Schaal_2006_Dynamic Movement Primitives -A Framework for Motor Control in Humans and.pdf:application/pdf},
}
@inproceedings{peters_policy_2006,
address = {Beijing, China},
title = {Policy {Gradient} {Methods} for {Robotics}},
url = {http://ieeexplore.ieee.org/document/4058714/},
doi = {10.1109/IROS.2006.282564},
abstract = {The acquisition and improvement of motor skills and control policies for robotics from trial and error is of essential importance if robots should ever leave precisely pre-structured environments. However, to date only few existing reinforcement learning methods have been scaled into the domains of high-dimensional robots such as manipulator, legged or humanoid robots. Policy gradient methods remain one of the few exceptions and have found a variety of applications. Nevertheless, the application of such methods is not without peril if done in an uninformed manner. In this paper, we give an overview on learning with policy gradient methods for robotics with a strong focus on recent advances in the field. We outline previous applications to robotics and show how the most recently developed methods can significantly improve learning performance. Finally, we evaluate our most promising algorithm in the application of hitting a baseball with an anthropomorphic arm.},
language = {en},
urldate = {2022-04-15},
booktitle = {2006 {IEEE}/{RSJ} {International} {Conference} on {Intelligent} {Robots} and {Systems}},
publisher = {IEEE},
author = {Peters, Jan and Schaal, Stefan},
month = oct,
year = {2006},
pages = {2219--2225},
file = {Peters_Schaal_2006_Policy Gradient Methods for Robotics.pdf:/home/dferigo/Zotero/storage/MAL8FFI9/Peters_Schaal_2006_Policy Gradient Methods for Robotics.pdf:application/pdf},
}
@article{benbrahim_biped_1997,
title = {Biped dynamic walking using reinforcement learning},
volume = {22},
issn = {09218890},
doi = {10.1016/S0921-8890(97)00043-2},
number = {3-4},
journal = {Robotics and Autonomous Systems},
author = {Benbrahim, Hamid and Franklin, Judy A.},
month = dec,
year = {1997},
pages = {283--302},
file = {Benbrahim_Franklin_1997_Biped dynamic walking using reinforcement learning.pdf:/home/dferigo/Zotero/storage/2JYIXAGY/Benbrahim_Franklin_1997_Biped dynamic walking using reinforcement learning.pdf:application/pdf},
}
@phdthesis{watkins_christopher_learning_1989,
title = {Learning from {Delayed} {Rewards}},
school = {King's College},
author = {Watkins, Christopher},
year = {1989},
file = {Watkins, Christopher John Cornish Hellaby_Learning from Delayed Rewards.pdf:/home/dferigo/Zotero/storage/BV9UA3K9/Watkins, Christopher John Cornish Hellaby_Learning from Delayed Rewards.pdf:application/pdf},
}
@article{hinton_fast_2006,
title = {A {Fast} {Learning} {Algorithm} for {Deep} {Belief} {Nets}},
volume = {18},
issn = {0899-7667, 1530-888X},
url = {https://direct.mit.edu/neco/article/18/7/1527-1554/7065},
doi = {10.1162/neco.2006.18.7.1527},
abstract = {We show how to use “complementary priors” to eliminate the explaining-away effects that make inference difficult in densely connected belief nets that have many hidden layers. Using complementary priors, we derive a fast, greedy algorithm that can learn deep, directed belief networks one layer at a time, provided the top two layers form an undirected associative memory. The fast, greedy algorithm is used to initialize a slower learning procedure that fine-tunes the weights using a contrastive version of the wake-sleep algorithm. After fine-tuning, a network with three hidden layers forms a very good generative model of the joint distribution of handwritten digit images and their labels. This generative model gives better digit classification than the best discriminative learning algorithms. The low-dimensional manifolds on which the digits lie are modeled by long ravines in the free-energy landscape of the top-level associative memory, and it is easy to explore these ravines by using the directed connections to display what the associative memory has in mind.},
language = {en},
number = {7},
urldate = {2022-04-15},
journal = {Neural Computation},
author = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee-Whye},
month = jul,
year = {2006},
pages = {1527--1554},
file = {Hinton et al_2006_A Fast Learning Algorithm for Deep Belief Nets.pdf:/home/dferigo/Zotero/storage/IMEASWQY/Hinton et al_2006_A Fast Learning Algorithm for Deep Belief Nets.pdf:application/pdf},
}
@techreport{rummery_-line_1994,
title = {On-{Line} {Q}-{Learning} {Using} {Connectionist} {Systems}},
abstract = {Reinforcement learning algorithms are a powerful machine learning technique. However, much of the work on these algorithms has been developed with regard to discrete finite-state Markovian problems, which is too restrictive for many real-world environments. Therefore, it is desirable to extend these methods to high dimensional continuous state-spaces, which requires the use of function approximation to generalise the information learnt by the system. In this report, the use of back-propagation neural networks (Rumelhart, Hinton and Williams 1986) is considered in this context. We consider a number of different algorithms based around Q-Learning (Watkins 1989) combined with the Temporal Difference algorithm (Sutton 1988), including a new algorithm (Modified Connectionist Q-Learning), and Q(λ) (Peng and Williams 1994). In addition, we present algorithms for applying these updates on-line during trials, unlike backward replay used by Lin (1993) that requires waiting until the end of each t...},
author = {Rummery, G. A. and Niranjan, M.},
year = {1994},
file = {Citeseer - Snapshot:/home/dferigo/Zotero/storage/IBKMIYHG/summary.html:text/html;Rummery_Niranjan_1994_On-Line Q-Learning Using Connectionist Systems.pdf:/home/dferigo/Zotero/storage/RP9VWYGZ/Rummery_Niranjan_1994_On-Line Q-Learning Using Connectionist Systems.pdf:application/pdf},
}
@article{williams_simple_1992,
title = {Simple statistical gradient-following algorithms for connectionist reinforcement learning},
volume = {8},
issn = {1573-0565},
url = {https://doi.org/10.1007/BF00992696},
doi = {10.1007/BF00992696},
abstract = {This article presents a general class of associative reinforcement learning algorithms for connectionist networks containing stochastic units. These algorithms, called REINFORCE algorithms, are shown to make weight adjustments in a direction that lies along the gradient of expected reinforcement in both immediate-reinforcement tasks and certain limited forms of delayed-reinforcement tasks, and they do this without explicitly computing gradient estimates or even storing information from which such estimates could be computed. Specific examples of such algorithms are presented, some of which bear a close relationship to certain existing algorithms while others are novel but potentially interesting in their own right. Also given are results that show how such algorithms can be naturally integrated with backpropagation. We close with a brief discussion of a number of additional issues surrounding the use of such algorithms, including what is known about their limiting behaviors as well as further considerations that might be used to help develop similar but potentially more powerful reinforcement learning algorithms.},
language = {en},
number = {3},
urldate = {2022-04-15},
journal = {Machine Learning},
author = {Williams, Ronald J.},
month = may,
year = {1992},
pages = {229--256},
file = {Williams_1992_Simple statistical gradient-following algorithms for connectionist.pdf:/home/dferigo/Zotero/storage/TICDQE49/Williams_1992_Simple statistical gradient-following algorithms for connectionist.pdf:application/pdf},
}
@article{sutton_learning_1988,
title = {Learning to predict by the methods of temporal differences},
volume = {3},
issn = {0885-6125, 1573-0565},
url = {http://link.springer.com/10.1007/BF00115009},
doi = {10.1007/BF00115009},
abstract = {This article introduces a class of incremental learning procedures specialized for prediction, that is, for using past experience with an incompletely known system to predict its future behavior. Whereas conventional prediction-learning methods assign credit by means of the difference between predicted and actual outcomes, the new methods assign credit by means of the difference between temporally successive predictions. Although such temporal-difference methods have been used in Samuel's checker player, Holland's bucket brigade, and the author's Adaptive Heuristic Critic, they have remained poorly understood. Here we prove their convergence and optimality for special cases and relate them to supervised-learning methods. For most real-world prediction problems, temporal-difference methods require less memory and less peak computation than conventional methods and they produce more accurate predictions. We argue that most problems to which supervised learning is currently applied are really prediction problems of the sort to which temporal-difference methods can be applied to advantage.},
language = {en},
number = {1},
urldate = {2022-04-15},
journal = {Machine Learning},
author = {Sutton, Richard S.},
month = aug,
year = {1988},
pages = {9--44},
file = {Sutton_1988_Learning to predict by the methods of temporal differences.pdf:/home/dferigo/Zotero/storage/XBWG7D4V/Sutton_1988_Learning to predict by the methods of temporal differences.pdf:application/pdf},
}
@article{tesauro_td-gammon_1994,
title = {{TD}-{Gammon}, a {Self}-{Teaching} {Backgammon} {Program}, {Achieves} {Master}-{Level} {Play}},
volume = {6},
issn = {0899-7667, 1530-888X},
url = {https://direct.mit.edu/neco/article/6/2/215-219/5771},
doi = {10.1162/neco.1994.6.2.215},
abstract = {TD-Gammon is a neural network that is able to teach itself to play backgammon solely by playing against itself and learning from the results, based on the TD(λ) reinforcement learning algorithm (Sutton, 1988). Despite starting from random initial weights (and hence random initial strategy), TD-Gammon achieves a surprisingly strong level of play. With zero knowledge built in at the start of learning (i.e. given only a "raw" description of the board state), the network learns to play at a strong intermediate level. Furthermore, when a set of hand-crafted features is added to the network’s input representation, the result is a truly staggering level of performance: the latest version of TD-Gammon is now estimated to play at a strong master level that is extremely close to the world’s best human players.},
language = {en},
number = {2},
urldate = {2022-04-15},
journal = {Neural Computation},
author = {Tesauro, Gerald},
month = mar,
year = {1994},
pages = {215--219},
file = {Tesauro_1994_TD-Gammon, a Self-Teaching Backgammon Program, Achieves Master-Level Play.pdf:/home/dferigo/Zotero/storage/9C9X9HWP/Tesauro_1994_TD-Gammon, a Self-Teaching Backgammon Program, Achieves Master-Level Play.pdf:application/pdf},
}
@phdthesis{lin_reinforcement_1993,
type = {{PhD} thesis},
title = {Reinforcement {Learning} for {Robots} {Using} {Neural} {Networks}},
language = {en},
school = {Carnegie Mellon University},
author = {Lin, Long-Ji},
year = {1993},
file = {Lin_1993_Reinforcement Learning for Robots Using Neural Networks.pdf:/home/dferigo/Zotero/storage/Y2VMYRV8/Lin_1993_Reinforcement Learning for Robots Using Neural Networks.pdf:application/pdf},
}
@article{narendra_identification_1990,
title = {Identification and control of dynamical systems using neural networks},
volume = {1},
issn = {10459227},
url = {http://ieeexplore.ieee.org/document/80202/},
doi = {10.1109/72.80202},
abstract = {The paper demonstrates that neural networks can be used effectively for the identification and control of nonlinear dynamical systems. The emphasis of the paper is on models for both identification and control. Static and dynamic back-propagation methods for the adjustment of parameters are discussed. In the models that are introduced, multilayer and recurrent networks are interconnected in novel configurations and hence there is a real need to study them in a unified fashion. Simulation results reveal that the identification and adaptive control schemes suggested are practically feasible. Basic concepts and definitions are introduced throughout the paper, and theoretical questions which have to be addressed are also described.},
language = {en},
number = {1},
urldate = {2022-04-15},
journal = {IEEE Transactions on Neural Networks},
author = {Narendra, K.S. and Parthasarathy, K.},
month = mar,
year = {1990},
pages = {4--27},
file = {Narendra_Parthasarathy_1990_Identification and control of dynamical systems using neural networks.pdf:/home/dferigo/Zotero/storage/T9ITSBDR/Narendra_Parthasarathy_1990_Identification and control of dynamical systems using neural networks.pdf:application/pdf},
}
@article{koberReinforcementLearningRobotics2013,
title = {Reinforcement {Learning} in {Robotics}: {A} {Survey}},
language = {en},
journal = {International Journal of Robotics Research},
author = {Kober, Jens and Bagnell, J Andrew and Peters, Jan},
year = {2013},
pages = {38},
file = {Kober2013-Reinforcement_Learning_in_Robotics.pdf:/home/dferigo/Zotero/storage/858JLQET/Kober2013-Reinforcement_Learning_in_Robotics.pdf:application/pdf;Kober2013-Reinforcement_Learning_in_Robotics.pdf:/home/dferigo/Zotero/storage/W3LHRYT7/Kober2013-Reinforcement_Learning_in_Robotics.pdf:application/pdf},
}
@article{chatzilygeroudis_survey_2020,
title = {A {Survey} on {Policy} {Search} {Algorithms} for {Learning} {Robot} {Controllers} in a {Handful} of {Trials}},
volume = {36},
issn = {1941-0468},
doi = {10.1109/TRO.2019.2958211},
abstract = {Most policy search (PS) algorithms require thousands of training episodes to find an effective policy, which is often infeasible with a physical robot. This survey article focuses on the extreme other end of the spectrum: how can a robot adapt with only a handful of trials (a dozen) and a few minutes? By analogy with the word “big-data,” we refer to this challenge as “micro-data reinforcement learning.” In this article, we show that a first strategy is to leverage prior knowledge on the policy structure (e.g., dynamic movement primitives), on the policy parameters (e.g., demonstrations), or on the dynamics (e.g., simulators). A second strategy is to create data-driven surrogate models of the expected reward (e.g., Bayesian optimization) or the dynamical model (e.g., model-based PS), so that the policy optimizer queries the model instead of the real system. Overall, all successful micro-data algorithms combine these two strategies by varying the kind of model and prior knowledge. The current scientific challenges essentially revolve around scaling up to complex robots, designing generic priors, and optimizing the computing time.},
number = {2},
journal = {IEEE Transactions on Robotics},
author = {Chatzilygeroudis, Konstantinos and Vassiliades, Vassilis and Stulp, Freek and Calinon, Sylvain and Mouret, Jean-Baptiste},
month = apr,
year = {2020},
keywords = {Autonomous agents, learning and adaptive systems, micro-data policy search (MDPS), robot learning},
pages = {328--347},
file = {Chatzilygeroudis et al_2020_A Survey on Policy Search Algorithms for Learning Robot Controllers in a.pdf:/home/dferigo/Zotero/storage/VLFIF5C6/Chatzilygeroudis et al_2020_A Survey on Policy Search Algorithms for Learning Robot Controllers in a.pdf:application/pdf;IEEE Xplore Abstract Record:/home/dferigo/Zotero/storage/I636FFXG/8944013.html:text/html},
}
@article{fabisch_survey_2019,
title = {A {Survey} of {Behavior} {Learning} {Applications} in {Robotics} -- {State} of the {Art} and {Perspectives}},
url = {http://arxiv.org/abs/1906.01868},
abstract = {Recent success of machine learning in many domains has been overwhelming, which often leads to false expectations regarding the capabilities of behavior learning in robotics. In this survey, we analyze the current state of machine learning for robotic behaviors. We will give a broad overview of behaviors that have been learned and used on real robots. Our focus is on kinematically or sensorially complex robots. That includes humanoid robots or parts of humanoid robots, for example, legged robots or robotic arms. We will classify presented behaviors according to various categories and we will draw conclusions about what can be learned and what should be learned. Furthermore, we will give an outlook on problems that are challenging today but might be solved by machine learning in the future and argue that classical robotics and other approaches from artificial intelligence should be integrated more with machine learning to form complete, autonomous systems.},
language = {en},
urldate = {2019-06-27},
journal = {arXiv:1906.01868 [cs]},
author = {Fabisch, Alexander and Petzoldt, Christoph and Otto, Marc and Kirchner, Frank},
month = jun,
year = {2019},
note = {arXiv: 1906.01868},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics},
file = {Fabisch et al_2019_A Survey of Behavior Learning Applications in Robotics -- State of the Art and.pdf:/home/dferigo/Zotero/storage/R9NX3LR5/Fabisch et al_2019_A Survey of Behavior Learning Applications in Robotics -- State of the Art and.pdf:application/pdf},
}
@article{busoniu_reinforcement_2018,
title = {Reinforcement learning for control: {Performance}, stability, and deep approximators},
volume = {46},
issn = {13675788},
shorttitle = {Reinforcement learning for control},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1367578818301184},
doi = {10.1016/j.arcontrol.2018.09.005},
abstract = {Reinforcement learning (RL) offers powerful algorithms to search for optimal controllers of systems with nonlinear, possibly stochastic dynamics that are unknown or highly uncertain. This review mainly covers artificial-intelligence approaches to RL, from the viewpoint of the control engineer. We explain how approximate representations of the solution make RL feasible for problems with continuous states and control actions. Stability is a central concern in control, and we argue that while the control-theoretic RL subfield called adaptive dynamic programming is dedicated to it, stability of RL largely remains an open question. We also cover in detail the case where deep neural networks are used for approximation, leading to the field of deep RL, which has shown great success in recent years. With the control practitioner in mind, we outline opportunities and pitfalls of deep RL; and we close the survey with an outlook that – among other things – points out some avenues for bridging the gap between control and artificial-intelligence RL techniques.},
language = {en},
urldate = {2018-12-17},
journal = {Annual Reviews in Control},
author = {Buşoniu, Lucian and de Bruin, Tim and Tolić, Domagoj and Kober, Jens and Palunko, Ivana},
year = {2018},
pages = {8--28},
file = {Buşoniu et al_2018_Reinforcement learning for control.pdf:/home/dferigo/Zotero/storage/KWV5Z2VL/Buşoniu et al_2018_Reinforcement learning for control.pdf:application/pdf},
}
@inproceedings{stulp_reinforcement_2010,
title = {Reinforcement learning of full-body humanoid motor skills},
isbn = {978-1-4244-8688-5},
url = {http://ieeexplore.ieee.org/document/5686320/},
doi = {10.1109/ICHR.2010.5686320},
abstract = {Applying reinforcement learning to humanoid robots is challenging because humanoids have a large number of degrees of freedom and state and action spaces are continuous. Thus, most reinforcement learning algorithms would become computationally infeasible and require a prohibitive amount of trials to explore such high-dimensional spaces. In this paper, we present a probabilistic reinforcement learning approach, which is derived from the framework of stochastic optimal control and path integrals. The algorithm, called Policy Improvement with Path Integrals (PI2), has a surprisingly simple form, has no open tuning parameters besides the exploration noise, is model-free, and performs numerically robustly in high dimensional learning problems. We demonstrate how PI2 is able to learn full-body motor skills on a 34-DOF humanoid robot. To demonstrate the generality of our approach, we also apply PI2 in the context of variable impedance control, where both planned trajectories and gain schedules for each joint are optimized simultaneously.},
language = {en},
urldate = {2018-06-17},
publisher = {IEEE},
author = {Stulp, Freek and Buchli, Jonas and Theodorou, Evangelos and Schaal, Stefan},
month = dec,
year = {2010},
pages = {405--410},
file = {Stulp et al_2010_Reinforcement learning of full-body humanoid motor skills.pdf:/home/dferigo/Zotero/storage/4H6NFGV6/Stulp et al_2010_Reinforcement learning of full-body humanoid motor skills.pdf:application/pdf},
}
@article{sola_micro_2020,
title = {A micro {Lie} theory for state estimation in robotics},
url = {http://arxiv.org/abs/1812.01537},
abstract = {A Lie group is an old mathematical abstract object dating back to the XIX century, when mathematician Sophus Lie laid the foundations of the theory of continuous transformation groups. Its influence has spread over diverse areas of science and technology many years later. In robotics, we are recently experiencing an important trend in its usage, at least in the fields of estimation, and particularly in motion estimation for navigation. Yet for a vast majority of roboticians, Lie groups are highly abstract constructions and therefore difficult to understand and to use.},
language = {en},
urldate = {2021-11-26},
journal = {arXiv:1812.01537 [cs]},
author = {Solà, Joan and Deray, Jeremie and Atchuthan, Dinesh},
month = nov,
year = {2020},
note = {arXiv: 1812.01537},
keywords = {Computer Science - Robotics},
file = {Solà et al_2020_A micro Lie theory for state estimation in robotics.pdf:/home/dferigo/Zotero/storage/IBW6W28A/Solà et al_2020_A micro Lie theory for state estimation in robotics.pdf:application/pdf},
}
@article{dulac-arnold_empirical_2021,
title = {An empirical investigation of the challenges of real-world reinforcement learning},
url = {http://arxiv.org/abs/2003.11881},
abstract = {Reinforcement learning (RL) has proven its worth in a series of artificial domains, and is beginning to show some successes in real-world scenarios. However, much of the research advances in RL are hard to leverage in real-world systems due to a series of assumptions that are rarely satisfied in practice. In this work, we identify and formalize a series of independent challenges that embody the difficulties that must be addressed for RL to be commonly deployed in real-world systems. For each challenge, we define it formally in the context of a Markov Decision Process, analyze the effects of the challenge on state-of-the-art learning algorithms, and present some existing attempts at tackling it. We believe that an approach that addresses our set of proposed challenges would be readily deployable in a large number of real world problems. Our proposed challenges are implemented in a suite of continuous control environments called realworldrl-suite which we propose as an open-source benchmark.},
language = {en},
urldate = {2022-04-13},
journal = {arXiv:2003.11881 [cs]},
author = {Dulac-Arnold, Gabriel and Levine, Nir and Mankowitz, Daniel J. and Li, Jerry and Paduraru, Cosmin and Gowal, Sven and Hester, Todd},
month = mar,
year = {2021},
note = {arXiv: 2003.11881},
keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence},
file = {Dulac-Arnold et al_2021_An empirical investigation of the challenges of real-world reinforcement.pdf:/home/dferigo/Zotero/storage/FN6Z7NFN/Dulac-Arnold et al_2021_An empirical investigation of the challenges of real-world reinforcement.pdf:application/pdf},
}
@article{dulac-arnold_empirical_2020,
title = {An empirical investigation of the challenges of real-world reinforcement learning},
url = {http://arxiv.org/abs/2003.11881},
abstract = {Reinforcement learning (RL) has proven its worth in a series of artificial domains, and is beginning to show some successes in real-world scenarios. However, much of the research advances in RL are hard to leverage in real-world systems due to a series of assumptions that are rarely satisfied in practice. In this work, we identify and formalize a series of independent challenges that embody the difficulties that must be addressed for RL to be commonly deployed in real-world systems. For each challenge, we define it formally in the context of a Markov Decision Process, analyze the effects of the challenge on state-of-the-art learning algorithms, and present some existing attempts at tackling it. We believe that an approach that addresses our set of proposed challenges would be readily deployable in a large number of real world problems. Our proposed challenges are implemented in a suite of continuous control environments called realworldrl-suite which we propose as an open-source benchmark.},
language = {en},
urldate = {2020-07-25},
journal = {arXiv:2003.11881 [cs]},
author = {Dulac-Arnold, Gabriel and Levine, Nir and Mankowitz, Daniel J. and Li, Jerry and Paduraru, Cosmin and Gowal, Sven and Hester, Todd},
month = mar,
year = {2020},
note = {arXiv: 2003.11881},
keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence},
file = {Dulac-Arnold et al_2020_An empirical investigation of the challenges of real-world reinforcement.pdf:/home/dferigo/Zotero/storage/X77JWG9W/Dulac-Arnold et al_2020_An empirical investigation of the challenges of real-world reinforcement.pdf:application/pdf},
}
@article{haarnoja_learning_2019,
title = {Learning to {Walk} via {Deep} {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/1812.11103},
abstract = {Deep reinforcement learning (deep RL) holds the promise of automating the acquisition of complex controllers that can map sensory inputs directly to low-level actions. In the domain of robotic locomotion, deep RL could enable learning locomotion skills with minimal engineering and without an explicit model of the robot dynamics. Unfortunately, applying deep RL to real-world robotic tasks is exceptionally difficult, primarily due to poor sample complexity and sensitivity to hyperparameters. While hyperparameters can be easily tuned in simulated domains, tuning may be prohibitively expensive on physical systems, such as legged robots, that can be damaged through extensive trial-and-error learning. In this paper, we propose a sample-efficient deep RL algorithm based on maximum entropy RL that requires minimal per-task tuning and only a modest number of trials to learn neural network policies. We apply this method to learning walking gaits on a real-world Minitaur robot. Our method can acquire a stable gait from scratch directly in the real world in about two hours, without relying on any model or simulation, and the resulting policy is robust to moderate variations in the environment. We further show that our algorithm achieves state-of-the-art performance on simulated benchmarks with a single set of hyperparameters. Videos of training and the learned policy can be found on the project website.},
urldate = {2020-05-07},
journal = {arXiv:1812.11103 [cs, stat]},
author = {Haarnoja, Tuomas and Ha, Sehoon and Zhou, Aurick and Tan, Jie and Tucker, George and Levine, Sergey},
month = jun,
year = {2019},
note = {arXiv: 1812.11103},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Computer Science - Artificial Intelligence, Statistics - Machine Learning},
file = {arXiv.org Snapshot:/home/dferigo/Zotero/storage/NVW3BT23/1812.html:text/html;Haarnoja et al_2019_Learning to Walk via Deep Reinforcement Learning.pdf:/home/dferigo/Zotero/storage/X3F83MC6/Haarnoja et al_2019_Learning to Walk via Deep Reinforcement Learning.pdf:application/pdf},
}
@inproceedings{rohmer_v-rep_2013,
title = {V-{REP}: {A} versatile and scalable robot simulation framework},
shorttitle = {V-{REP}},
doi = {10.1109/IROS.2013.6696520},
abstract = {From exploring planets to cleaning homes, the reach and versatility of robotics is vast. The integration of actuation, sensing and control makes robotics systems powerful, but complicates their simulation. This paper introduces a versatile, scalable, yet powerful general-purpose robot simulation framework called V-REP. The paper discusses the utility of a portable and flexible simulation framework that allows for direct incorporation of various control techniques. This renders simulations and simulation models more accessible to a general-public, by reducing the simulation model deployment complexity. It also increases productivity by offering built-in and ready-to-use functionalities, as well as a multitude of programming approaches. This allows for a multitude of applications including rapid algorithm development, system verification, rapid prototyping, and deployment for cases such as safety/remote monitoring, training and education, hardware control, and factory automation simulation.},
booktitle = {2013 {IEEE}/{RSJ} {International} {Conference} on {Intelligent} {Robots} and {Systems}},
author = {Rohmer, Eric and Singh, Surya P. N. and Freese, Marc},
month = nov,
year = {2013},
note = {ISSN: 2153-0866},
keywords = {Computational modeling, Hardware, Joints, Load modeling, Robots, Sensors, Shape},
pages = {1321--1326},
file = {Rohmer et al_2013_V-REP.pdf:/home/dferigo/Zotero/storage/P46T7D6L/Rohmer et al_2013_V-REP.pdf:application/pdf},
}
@article{ibarz_how_2021,
title = {How to {Train} {Your} {Robot} with {Deep} {Reinforcement} {Learning}; {Lessons} {We}'ve {Learned}},
issn = {0278-3649, 1741-3176},
url = {http://arxiv.org/abs/2102.02915},
doi = {10.1177/0278364920987859},
abstract = {Deep reinforcement learning (RL) has emerged as a promising approach for autonomously acquiring complex behaviors from low level sensor observations. Although a large portion of deep RL research has focused on applications in video games and simulated control, which does not connect with the constraints of learning in real environments, deep RL has also demonstrated promise in enabling physical robots to learn complex skills in the real world. At the same time, real-world robotics provides an appealing domain for evaluating such algorithms, as it connects directly to how humans learn: as an embodied agent in the real world. Learning to perceive and move in the real world presents numerous challenges, some of which are easier to address than others, and some of which are often not considered in RL research that focuses only on simulated domains. In this review article, we present a number of case studies involving robotic deep RL. Building off of these case studies, we discuss commonly perceived challenges in deep RL and how they have been addressed in these works. We also provide an overview of other outstanding challenges, many of which are unique to the real-world robotics setting and are not often the focus of mainstream RL research. Our goal is to provide a resource both for roboticists and machine learning researchers who are interested in furthering the progress of deep RL in the real world.},
urldate = {2021-04-08},
journal = {The International Journal of Robotics Research},
author = {Ibarz, Julian and Tan, Jie and Finn, Chelsea and Kalakrishnan, Mrinal and Pastor, Peter and Levine, Sergey},
month = jan,
year = {2021},
note = {arXiv: 2102.02915},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics},
pages = {027836492098785},
file = {arXiv.org Snapshot:/home/dferigo/Zotero/storage/89CMU29H/2102.html:text/html;Ibarz et al_2021_How to Train Your Robot with Deep Reinforcement Learning\; Lessons We've Learned.pdf:/home/dferigo/Zotero/storage/YXS79SEC/Ibarz et al_2021_How to Train Your Robot with Deep Reinforcement Learning\; Lessons We've Learned.pdf:application/pdf},
}
@article{da_learning_2020,
title = {Learning a {Contact}-{Adaptive} {Controller} for {Robust}, {Efficient} {Legged} {Locomotion}},
url = {http://arxiv.org/abs/2009.10019},
abstract = {We present a hierarchical framework that combines model-based control and reinforcement learning (RL) to synthesize robust controllers for a quadruped (the Unitree Laikago). The system consists of a high-level controller that learns to choose from a set of primitives in response to changes in the environment and a low-level controller that utilizes an established control method to robustly execute the primitives. Our framework learns a controller that can adapt to challenging environmental changes on the fly, including novel scenarios not seen during training. The learned controller is up to 85 percent more energy efficient and is more robust compared to baseline methods. We also deploy the controller on a physical robot without any randomization or adaptation scheme.},
language = {en},
urldate = {2020-10-01},
journal = {arXiv:2009.10019 [cs]},
author = {Da, Xingye and Xie, Zhaoming and Hoeller, David and Boots, Byron and Anandkumar, Animashree and Zhu, Yuke and Babich, Buck and Garg, Animesh},
month = sep,
year = {2020},
note = {arXiv: 2009.10019},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics},
file = {Da et al_2020_Learning a Contact-Adaptive Controller for Robust, Efficient Legged Locomotion.pdf:/home/dferigo/Zotero/storage/UABUZD58/Da et al_2020_Learning a Contact-Adaptive Controller for Robust, Efficient Legged Locomotion.pdf:application/pdf},
}
@article{yang_data_2019,
title = {Data {Efficient} {Reinforcement} {Learning} for {Legged} {Robots}},
url = {http://arxiv.org/abs/1907.03613},
abstract = {We present a model-based reinforcement learning framework for robot locomotion that achieves walking based on only 4.5 minutes of data collected on a quadruped robot. To accurately model the robot’s dynamics over a long horizon, we introduce a loss function that tracks the model’s prediction over multiple timesteps. We adapt model predictive control to account for planning latency, which allows the learned model to be used for real time control. Additionally, to ensure safe exploration during model learning, we embed prior knowledge of leg trajectories into the action space. The resulting system achieves fast and robust locomotion. Unlike model-free methods, which optimize for a particular task, our planner can use the same learned dynamics for various tasks, simply by changing the reward function. To the best of our knowledge, our approach is more than an order of magnitude more sample efficient than current model-free methods.},
language = {en},
urldate = {2022-04-13},
journal = {arXiv:1907.03613 [cs]},
author = {Yang, Yuxiang and Caluwaerts, Ken and Iscen, Atil and Zhang, Tingnan and Tan, Jie and Sindhwani, Vikas},
month = oct,
year = {2019},
note = {arXiv: 1907.03613},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Computer Science - Artificial Intelligence},
file = {Yang et al_2019_Data Efficient Reinforcement Learning for Legged Robots.pdf:/home/dferigo/Zotero/storage/XQMX3SIP/Yang et al_2019_Data Efficient Reinforcement Learning for Legged Robots.pdf:application/pdf},
}
@article{hwangboLearningAgileDynamic2019s,
title = {Learning agile and dynamic motor skills for legged robots},
issn = {2470-9476},
doi = {10.1126/scirobotics.aau5872},
language = {en},
urldate = {2019-05-20},
journal = {Science Robotics},
author = {Hwangbo, Jemin and Lee, Joonho and Dosovitskiy, Alexey and Bellicoso, Dario and Tsounis, Vassilios and Koltun, Vladlen and Hutter, Marco},
month = jan,
year = {2019},
file = {Hwangbo et al_2019_Learning agile and dynamic motor skills for legged robots.pdf:/home/dferigo/Zotero/storage/GDPEWKB2/Hwangbo et al_2019_Learning agile and dynamic motor skills for legged robots.pdf:application/pdf},
}
@article{tsounis_deepgait_2020,
title = {{DeepGait}: {Planning} and {Control} of {Quadrupedal} {Gaits} using {Deep} {Reinforcement} {Learning}},
shorttitle = {{DeepGait}},
url = {http://arxiv.org/abs/1909.08399},
abstract = {This paper addresses the problem of legged locomotion in non-flat terrain. As legged robots such as quadrupeds are to be deployed in terrains with geometries which are difficult to model and predict, the need arises to equip them with the capability to generalize well to unforeseen situations. In this work, we propose a novel technique for training neural-network policies for terrain-aware locomotion, which combines state-of-the-art methods for model-based motion planning and reinforcement learning. Our approach is centered on formulating Markov decision processes using the evaluation of dynamic feasibility criteria in place of physical simulation. We thus employ policy-gradient methods to independently train policies which respectively plan and execute foothold and base motions in 3D environments using both proprioceptive and exteroceptive measurements. We apply our method within a challenging suite of simulated terrain scenarios which contain features such as narrow bridges, gaps and stepping-stones, and train policies which succeed in locomoting effectively in all cases.},
urldate = {2020-07-28},
journal = {arXiv:1909.08399 [cs]},
author = {Tsounis, Vassilios and Alge, Mitja and Lee, Joonho and Farshidian, Farbod and Hutter, Marco},
month = jan,
year = {2020},